carpedm20 committed on Jan 23

Commit

29c669a

verified ·

1 Parent(s): 77ac065

Add Flux2 Klein compiled caches

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +14 -0
meta.json +44 -0
torchinductor/2h/a581feca05a976cd76073f2f954a7641097b9c5775b12cf6831b3149d528a8b4.best_config +1 -0
torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py +70 -0
torchinductor/2o/c2oduffhka4c52657rppatcdtgtnibm42qywfo2spmul2dpsj6jj.py +297 -0
torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py +45 -0
torchinductor/3i/cf1587a2fd240ce39177274973308f6fd100d746bf6716a8d96ed4fd12c89d55.best_config +1 -0
torchinductor/3v/4a00da1b5d4ce251d2cb392c24118fc2e6c3818f25b8457665f0d53e12234277.best_config +1 -0
torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py +28 -0
torchinductor/4y/c4ykjyk6fv6enet6mgkj5bsan42tc6rsdfs7aaskpjgv5rzw7tbr.py +357 -0
torchinductor/6k/abd9e26dfce6bf628201c09f1f90f4340fdaab3cc2dd99f7186afe82fe013d1a.best_config +1 -0
torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py +30 -0
torchinductor/6w/4fb0f9adeff50e9452e8fd238a1808052c095c59a0b2f1d9f3f7d7106bd1ede5.best_config +1 -0
torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py +33 -0
torchinductor/7f/be95397d0c18f43f4314e0cac66d456d9d3e2b12116963a4bf988016e97f7a5e.best_config +1 -0
torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py +30 -0
torchinductor/a3/94dc88253134d772dc28ed260760d9a0059b054d472700be3c22dd06b228f22f.best_config +1 -0
torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py +78 -0
torchinductor/aotautograd/a27rkqg32yfaub3aygtms2gl3oet2qxfcnp4zxa3zy5h6c3risxz/aw5eda3h36wpnnltujgkb4mvobznersd4fuvo2p7vy2quujasos +0 -0
torchinductor/aotautograd/a3443o3ywoehrda4trn5q47mauudwcinftvd52hitdnfmakyhqc4/lw6yvpbd45y77sg6fh5v4otbinchkwbf7b56u3rh3wgq3x2wkhq +0 -0
torchinductor/aotautograd/a3554ihbxq57jan4ib74iqo5mnaqevqume4yzewukzkm6ehpsilz/eubahghkef62rmchvnle5v6h3ddip4av5qqjxomdlm7ura45qve +0 -0
torchinductor/aotautograd/a3hojixb5fzn7f7jfco3ddoohdsuggk4qbop3lcg7rjy3e7fkgfz/o7wvolbgborwtoofbovayor23y4ubooymfcvv6jeqm2wbx3n2cs +0 -0
torchinductor/aotautograd/a54twb2qknddjxnxtmkoagy3umo5y3ptsesm2pdhy7nkefklf6wx/emxzj524wmpvifsxw4dsnnkzemqpzfgkenbo5obwmksvlhsr354 +0 -0
torchinductor/aotautograd/a5ksywxhfabbequvxwstheyyj5w3sinuubxcrypqjwbqsyw5la3l/ew4fxjyfoflznyuws2w2ylu4p7owpjuqoshsef75w43w2vvwejd +0 -0
torchinductor/aotautograd/a7ptufzlocphh5n5o5u63gfzkf74tjb3l5is45u5hqjspv32qda6/an4kgppgf4vt5yfvvghrnmho6jc3qnj4l6c75zrsiotr5d4u5gv +0 -0
torchinductor/aotautograd/aal6kceyfi7eazavxzpgcec5hzt32bkwo7p4doeyc56ubzlwuvx4/nkoni3ckgbheucucq64bmrta4lhz7x237lalaqcrejvdc3supg4 +0 -0
torchinductor/aotautograd/aan5kpy6i54rnpeu5vlzbx6i6blimsvhducl7futzdjr4xciy472/a35s4usnkzmh6ybhedo3b6zehfepmwdv2gxscayjeeuucr3zat7 +0 -0
torchinductor/aotautograd/aesonb7djseswkbtu2qzhvg6ikd5rewxnqlt6pwuytadpxxmjcod/lap2sypphhofd6d5rhojruk2vfyvw2olc7gtulmom4i5y7ix2cp +0 -0
torchinductor/aotautograd/age65c4dyk2rxcqufpxd6bsafzao7tacrsvejbf3pjbsngnoashv/upzttal3jaj233iyzyps7mjpq75jt6qi6rzramvgyyewfg76h6s +0 -0
torchinductor/aotautograd/ahkpwjcp2qqyj6wu2ckjqlrit2pbb3ig3ddi75hgbkgngvvipwyq/ha76p7wv3nimmrgvx6kdiqikd6adbw7nlnaiars5ey4anx46mwn +0 -0
torchinductor/aotautograd/aiojzczi5txclvaydkrk5g3qlf33pdkkhxtefkhfphkpc3o6rr4p/w3n37k3qhqfhuewneurnairyblp3h7nrak6oyp2p3um7uwnfcz5 +0 -0
torchinductor/aotautograd/ajdkg3gacw25klanvqotc3mkab3mi23jtjpagxrosdmqv3d4yg7v/ejzrqbsrchqzxfppkzo4ep7edhv7lrjjbcdxkxvodbk4vvk3b62 +0 -0
torchinductor/aotautograd/amb262dx57ptj6gg2ch6skr372w6arsr3i7i4ed5pljhiycuxduw/fntav2w4z5lvr443jxseqalau2vuzp7x7ljd3hanoqubtutjkvp +0 -0
torchinductor/aotautograd/amjjivi2p6firai3idkjgfxyy6z4prevujsjdno2uuchwvd7xqll/enc6ruqcyggs4mnt54tjdd2lvexcvipd5vhhamxwcj77g5fpyof +0 -0
torchinductor/aotautograd/apfaqlwe555qd2zoz575w5mvoxoiasmcomkv76mhz5zvnm5jok66/epmli5r46rzrqf73pqrnb5tratdg3mbbwdf5vyzqr6ejyhnooye +0 -0
torchinductor/aotautograd/asjbg7f735jw54kcldmvv5uost22wzpy3hkxgaihos4rllvagheu/lwqpsnp52rszp2nlwkgi33embno5st2u5bxfm4rpyoy6fql5aor +0 -0
torchinductor/aotautograd/atc2ggqhejcse5aydwh2wjakijsc2dyhqjxwdqrwpra3mgjwe4st/xwy7lzraqocjillvk4s2yc2qhpkx43s2nbkxmeb2wpph3sgyc7n +0 -0
torchinductor/aotautograd/atsevoi6zqdcnehuxassvjosi3j5vrk54uisibylfgspeewp6vyx/4sfzv7d6ch2yoi6nnr5ym3i6yibku3vfveyrr6sx6dqbmavxo32 +0 -0
torchinductor/aotautograd/ax7bbwqbruobasu7vagn2oj2owh5vgosxbjelta324rvf4tkesd4/ipnutob47ydixp2zetluyw4apg7fe5sfkkiianwaawh6yq3uang +0 -0
torchinductor/aotautograd/ay26zyuzpll2prvy7zzoeydo7r47lrr6s6jcmzi2zmytjxzebmnz/nzx7lukg3r25p6sjlwtqmkf6gmgzuq7iwagwki2x4kvhw5ducr5 +0 -0
torchinductor/aotautograd/ay65riayezoo7bqggl72pzrzdi6lvy5mp23ajx4f453ylzpmve3s/p7clvcke3bsgsaumutstrxc7bkq4tq6yoia7nwigana3n3unini +0 -0
torchinductor/aotautograd/azyih32olvhzuay5zpfypzhk2cdlosvaqxdhcnjzlwfs6k3a2ne6/5sz2kjdze7ixdny7hz24p4uma7uup7chdcpiumqznifqn4mpmqb +0 -0
torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py +73 -0
torchinductor/av/d186a24d3c8af5514b42dea48fc981efd3f5afb7bba6c30406e42c75862888b1.best_config +1 -0
torchinductor/ay/cayicsdjyjxzpcmkvjbneubnqkuhs3y37qiwy5qlel3z2loa4qav.py +69 -0
torchinductor/bv/7969eba2eb589b95d2894ee75ee67ba01cd2bee09cd64d315c70c0950888c19e.best_config +1 -0
torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py +162 -0
torchinductor/cr/ccr2gijy4jp6vvdbewmzgaogxbf5as7ytxtou4zo2yelawomrjjg.py +131 -0
torchinductor/cz/bb6645c6be31f426023ec47eef09e354ad9fa8b2d59e6e45ab49b803eb34d44e.best_config +1 -0
torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py +25 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4 filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4 filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7 filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf filter=lfs diff=lfs merge=lfs -text

meta.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "cache_layout_version": 1,
+  "created_at": "2026-01-23T07:13:39Z",
+  "model_path": "/root/.cache/huggingface/hub/models--black-forest-labs--FLUX.2-klein-9B/snapshots/cd1bba5810fe2aba6666d9cf7352e25436426039",
+  "compile_command": [
+    "/usr/bin/python",
+    "/app/tensorrt_llm/visual_gen/examples/flux2_klein_9b.py",
+    "--model_path",
+    "/root/.cache/huggingface/hub/models--black-forest-labs--FLUX.2-klein-9B/snapshots/cd1bba5810fe2aba6666d9cf7352e25436426039",
+    "--height",
+    "512",
+    "--width",
+    "1024",
+    "--num_inference_steps",
+    "4",
+    "--num_images",
+    "6",
+    "--linear_type",
+    "te-fp8-per-tensor",
+    "--fallback_linear_type",
+    "default",
+    "--torch_compile_mode",
+    "default",
+    "--offload_text_encoder"
+  ],
+  "height": 512,
+  "width": 1024,
+  "num_inference_steps": 4,
+  "num_images": 6,
+  "linear_type": "te-fp8-per-tensor",
+  "fallback_linear_type": "default",
+  "torch_compile_mode": "default",
+  "offload_text_encoder": true,
+  "offload_vae": false,
+  "disable_cuda_graph": false,
+  "disable_teacache": false,
+  "torch_version": "2.10.0a0+b4e4ee81d3.nv25.12",
+  "cuda_version": "13.1",
+  "device_name": "NVIDIA GeForce RTX 4090",
+  "device_capability": [
+    8,
+    9
+  ]
+}

torchinductor/2h/a581feca05a976cd76073f2f954a7641097b9c5775b12cf6831b3149d528a8b4.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 64, "YBLOCK": 64, "num_warps": 8, "num_stages": 1, "configs_hash": "1ce421918d79ed0f7edb09d0ee64f016daf650a007a21866fe52d592be55380c", "found_by_coordesc": false, "time_taken_ms": 143, "triton_cache_hash": "RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA"}

torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()  # runs at import time, before the kernel below is defined
+@triton_heuristics.pointwise(
+    size_hints={'y': 131072, 'x': 128}, tile_hint=TileHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*bf16', 'out_ptr0': '*bf16', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid2DWithYZOverflow', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__fused_rms_norm_cat_view_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'y': 589824, 'x': 75497984}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__fused_rms_norm_cat_view_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):  # pointwise kernel: per y-slice, picks one of two scaled inputs and writes it to out_ptr0
+    ynumel = 73728  # problem sizes are baked in; the ynumel/xnumel arguments are overwritten here
+    xnumel = 128
+    yoffset = (tl.program_id(1) + tl.program_id(2) * tl.num_programs(1)) * YBLOCK  # y block id is folded from grid axes 1 and 2 (grid_type 'Grid2DWithYZOverflow')
+    yindex = yoffset + tl.arange(0, YBLOCK)[:, None]  # column of y indices, shape [YBLOCK, 1]
+    ymask = yindex < ynumel
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[None, :]  # row of x indices, shape [1, XBLOCK]
+    xmask = xindex < xnumel
+    y1 = yindex // 32  # y is split as y = 32*y1 + y0; with ynumel = 73728, y1 spans 0..2303
+    x2 = xindex
+    y0 = (yindex % 32)
+    y3 = yindex
+    tmp0 = y1
+    tmp1 = tl.full([1, 1], 0, tl.int64)
+    tmp2 = tmp0 >= tmp1  # lower-bound check; the result is not used below
+    tmp3 = tl.full([1, 1], 256, tl.int64)
+    tmp4 = tmp0 < tmp3  # True where y1 < 256: these rows come from the first input group (in_ptr0/1/2)
+    tmp5 = tl.load(in_ptr0 + (x2 + 128*y0 + 12288*(y1)), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # first-group element, upcast to fp32
+    tmp6 = tmp5.to(tl.float32)
+    tmp7 = tl.load(in_ptr1 + (tl.broadcast_to(y0 + 32*(y1), [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0)  # one fp32 value per (y1, y0) row, broadcast across x
+    tmp8 = 128.0
+    tmp9 = (tmp7 / tmp8)  # row value / 128 (128 == xnumel); presumably a sum of squares turned into a mean -- TODO confirm against the producer
+    tmp10 = 1e-06
+    tmp11 = tmp9 + tmp10
+    tmp12 = libdevice.rsqrt(tmp11)  # rsqrt(row_value / 128 + 1e-06)
+    tmp13 = tmp6 * tmp12
+    tmp14 = tl.load(in_ptr2 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # vector indexed by x only; presumably the norm weight -- TODO confirm
+    tmp15 = tmp14.to(tl.float32)
+    tmp16 = tmp13 * tmp15
+    tmp17 = tmp16.to(tl.float32)
+    tmp18 = tl.full(tmp17.shape, 0.0, tmp17.dtype)
+    tmp19 = tl.where(tmp4, tmp17, tmp18)  # zero outside the first group
+    tmp20 = tmp0 >= tmp3  # True where y1 >= 256: these rows come from the second input group (in_ptr3/4/5)
+    tmp21 = tl.full([1, 1], 2304, tl.int64)
+    tmp22 = tmp0 < tmp21  # upper-bound check (2304 == 73728 // 32); the result is not used below
+    tmp23 = tl.load(in_ptr3 + (x2 + 128*y0 + 12288*((-256) + y1)), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # second group is indexed with y1 shifted down by 256
+    tmp24 = tmp23.to(tl.float32)
+    tmp25 = tl.load(in_ptr4 + (tl.broadcast_to(y0 + 32*((-256) + y1), [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0)
+    tmp26 = 128.0
+    tmp27 = (tmp25 / tmp26)  # same scaling as the first group, applied to in_ptr4
+    tmp28 = 1e-06
+    tmp29 = tmp27 + tmp28
+    tmp30 = libdevice.rsqrt(tmp29)
+    tmp31 = tmp24 * tmp30
+    tmp32 = tl.load(in_ptr5 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp33 = tmp32.to(tl.float32)
+    tmp34 = tmp31 * tmp33
+    tmp35 = tmp34.to(tl.float32)
+    tmp36 = tl.full(tmp35.shape, 0.0, tmp35.dtype)
+    tmp37 = tl.where(tmp20, tmp35, tmp36)  # zero outside the second group
+    tmp38 = tl.where(tmp4, tmp19, tmp37)  # first-group value where y1 < 256, otherwise the second-group value
+    tl.store(out_ptr0 + (x2 + 128*y3), tmp38, xmask & ymask)  # one store per in-range (y, x) element at offset x + 128*y

torchinductor/2o/c2oduffhka4c52657rppatcdtgtnibm42qywfo2spmul2dpsj6jj.py ADDED Viewed

	@@ -0,0 +1,297 @@

+# AOT ID: ['0_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+aten = torch.ops.aten  # short aliases for op namespaces
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride  # called in Runner.call on every input
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda  # called in Runner.call to allocate the two output buffers
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()  # receives the Triton kernel sources below; deleted after async_compile.wait(globals())
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py
+# Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+# Source node to ATen node mapping:
+#   add => add_1
+#   mul => mul_1
+#   norm_hidden_states => add, convert_element_type, convert_element_type_1, mul, rsqrt, sub, var_mean
+#   norm_hidden_states_1 => add_2
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %arg1_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %getitem_1 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=getitem_1]
+#   %buf1 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=buf1]
+#   %arg2_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %convert_element_type : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg0_1, torch.float32), kwargs = {})
+#   %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True})
+#   %add_1 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg1_1, 1), kwargs = {})
+#   %sub : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {})
+#   %add : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {})
+#   %rsqrt : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {})
+#   %mul : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {})
+#   %mul_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_1, %convert_element_type_1), kwargs = {})
+#   %add_2 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_1, %arg2_1), kwargs = {})
+#   return %getitem_1,%buf1,%add_2
+triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', '''
+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.reduction(
+    size_hints={'x': 2048, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 50348032}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 2048
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
+    tmp3 = tmp4[:, None]
+    tmp7 = tmp5[:, None]
+    tmp8 = tmp6[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)
+''', device_str='cuda')  # the kernel source above is passed as a string; two passes over each 4096-wide row: Welford mean/M2, then (in_ptr1 + 1) * normalized + in_ptr2. Launched from Runner.call via .run(...)
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py
+# Topologically Sorted Source Nodes: [norm_encoder_hidden_states, add_2, mul_1, norm_encoder_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+# Source node to ATen node mapping:
+#   add_2 => add_4
+#   mul_1 => mul_3
+#   norm_encoder_hidden_states => add_3, convert_element_type_2, convert_element_type_3, mul_2, rsqrt_1, sub_1, var_mean_1
+#   norm_encoder_hidden_states_1 => add_5
+# Graph fragment:
+#   %arg3_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg3_1]
+#   %arg4_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg4_1]
+#   %getitem_3 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=getitem_3]
+#   %buf4 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=buf4]
+#   %arg5_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg5_1]
+#   %convert_element_type_2 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg3_1, torch.float32), kwargs = {})
+#   %var_mean_1 : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type_2, [2]), kwargs = {correction: 0, keepdim: True})
+#   %add_4 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg4_1, 1), kwargs = {})
+#   %sub_1 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_2, %getitem_3), kwargs = {})
+#   %add_3 : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem_2, 1e-06), kwargs = {})
+#   %rsqrt_1 : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_3,), kwargs = {})
+#   %mul_2 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_1, %rsqrt_1), kwargs = {})
+#   %convert_element_type_3 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_2, torch.bfloat16), kwargs = {})
+#   %mul_3 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_4, %convert_element_type_3), kwargs = {})
+#   %add_5 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_3, %arg5_1), kwargs = {})
+#   return %getitem_3,%buf4,%add_5
+triton_red_fused_add_mul_native_layer_norm_1 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_1', '''
+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.reduction(
+    size_hints={'x': 256, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 6307840}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_1(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 256
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
+    tmp3 = tmp4[:, None]
+    tmp7 = tmp5[:, None]
+    tmp8 = tmp6[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)
+''', device_str='cuda')  # same kernel body as the _0 variant with xnumel = 256 instead of 2048; launched from Runner.call on the (1, 256, 4096) input
+async_compile.wait(globals())  # passes this module's globals, where the two kernel names were bound above
+del async_compile  # the compile helper is not used after this point in the file
+class Runner:  # bundles the generated entry point `call` with a list of partitions that can be transformed in place
+    def __init__(self, partitions):  # partitions: sequence stored as-is on the instance
+        self.partitions = partitions
+    def recursively_apply_fns(self, fns):  # replaces each partition c with fn(c), pairing fns and partitions by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter sequence, so unmatched entries are dropped
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+    def call(self, args):  # args: list of exactly six tensors; returns the tuple (buf6, buf7)
+        arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1 = args
+        args.clear()  # empties the caller's list so it no longer holds the input tensors
+        assert_size_stride(arg0_1, (1, 2048, 4096), (8388608, 4096, 1))  # each check pins the (size, stride) layout this wrapper expects; presumably raises on mismatch
+        assert_size_stride(arg1_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg2_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg3_1, (1, 256, 4096), (1048576, 4096, 1))
+        assert_size_stride(arg4_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg5_1, (1, 1, 4096), (24576, 24576, 1))
+        with torch.cuda._DeviceGuard(0):  # device index 0 is hard-coded throughout this wrapper
+            torch.cuda.set_device(0)
+            buf6 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16)  # output buffer with the same size/stride as arg0_1
+            # Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+            stream0 = get_raw_stream(0)
+            triton_red_fused_add_mul_native_layer_norm_0.run(arg0_1, arg1_1, arg2_1, buf6, 2048, 4096, stream=stream0)  # 2048 and 4096 match dims 1 and 2 of arg0_1; kernel signature is outside this view -- TODO confirm argument roles
+            del arg0_1  # local references to the inputs are dropped right after the launch that consumes them
+            del arg1_1
+            del arg2_1
+            buf7 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16)  # output buffer with the same size/stride as arg3_1
+            # Topologically Sorted Source Nodes: [norm_encoder_hidden_states, add_2, mul_1, norm_encoder_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+            stream0 = get_raw_stream(0)
+            triton_red_fused_add_mul_native_layer_norm_1.run(arg3_1, arg4_1, arg5_1, buf7, 256, 4096, stream=stream0)  # same call shape as the launch above, with 256 in place of 2048
+            del arg3_1
+            del arg4_1
+            del arg5_1
+        return (buf6, buf7, )
+runner = Runner(partitions=[])  # single module-level instance, created with an empty partition list
+call = runner.call  # bound method exposed under the module-level name `call`
+recursively_apply_fns = runner.recursively_apply_fns  # bound method exposed under the same module-level name
+def benchmark_compiled_module(times=10, repeat=10):  # builds six random bf16 CUDA inputs and returns print_performance(...) for one call() invocation
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16)  # sizes/strides below mirror the assert_size_stride checks in Runner.call
+    arg1_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg3_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg4_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg5_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1])  # a fresh list per invocation, because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+if __name__ == "__main__":  # running the file directly hands the benchmark function to compiled_module_main
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object

torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()  # presumably selects the GPU Triton driver before the kernel below is defined -- confirm
+@triton_heuristics.pointwise(
+    size_hints={'x': 67108864},
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_mul_silu_split_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 377487360}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_cat_mul_silu_split_view_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # writes 16384-wide rows: 4096 values copied from in_ptr0, then 12288 sigmoid-gated products built from in_ptr1
+    xnumel = 37748736  # overrides the argument with a fixed element count (2304 * 16384)
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True; the loads and the store below do not use it
+    x0 = (xindex % 16384)  # column within an output row
+    x1 = xindex // 16384  # output row
+    x2 = xindex  # flat output offset
+    tmp0 = x0
+    tmp1 = tl.full([1], 0, tl.int64)
+    tmp2 = tmp0 >= tmp1
+    tmp3 = tl.full([1], 4096, tl.int64)
+    tmp4 = tmp0 < tmp3  # True for columns [0, 4096): value comes from in_ptr0
+    tmp5 = tl.load(in_ptr0 + (4096*x1 + (x0)), tmp4, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp6 = tmp0 >= tmp3  # True for columns [4096, 16384): value is computed from in_ptr1
+    tmp7 = tl.full([1], 16384, tl.int64)
+    tmp8 = tmp0 < tmp7
+    tmp9 = tl.load(in_ptr1 + (36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32)  # in_ptr1 rows are 36864 elements apart
+    tmp10 = tmp9.to(tl.float32)
+    tmp11 = tl.sigmoid(tmp10)
+    tmp12 = tmp10 * tmp11  # v * sigmoid(v)
+    tmp13 = tmp12.to(tl.float32)
+    tmp14 = tl.load(in_ptr1 + (12288 + 36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32)  # second operand sits 12288 elements after the first within the same row
+    tmp15 = tmp13 * tmp14
+    tmp16 = tl.full(tmp15.shape, 0.0, tmp15.dtype)
+    tmp17 = tl.where(tmp6, tmp15, tmp16)
+    tmp18 = tl.where(tmp4, tmp5, tmp17)  # per column: the copied value or the gated product
+    tl.store(out_ptr0 + (x2), tmp18, None)  # no mask is applied to the store

torchinductor/3i/cf1587a2fd240ce39177274973308f6fd100d746bf6716a8d96ed4fd12c89d55.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 81, "triton_cache_hash": "PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ"}

torchinductor/3v/4a00da1b5d4ce251d2cb392c24118fc2e6c3818f25b8457665f0d53e12234277.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 43, "triton_cache_hash": "SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A"}

torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()  # presumably selects the GPU Triton driver before the kernel below is defined -- confirm
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216},
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 37748736}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):  # strided gather from in_ptr0 into a densely written out_ptr0; ks0 is the input stride applied to x1
+    xnumel = 9437184  # overrides the argument with a fixed element count (32 * 2304 * 128)
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True; the load and the store below do not use it
+    x0 = (xindex % 128)  # fastest-varying output index
+    x1 = ((xindex // 128) % 2304)  # middle output index
+    x2 = xindex // 294912  # slowest output index; 294912 = 2304 * 128
+    x3 = xindex  # flat output offset
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + ks0*x1), None).to(tl.float32)  # input is read with stride 128 along x2 and stride ks0 along x1
+    tl.store(out_ptr0 + (x3), tmp0, None)  # output offsets follow (x2, x1, x0) order; no mask is applied

torchinductor/4y/c4ykjyk6fv6enet6mgkj5bsan42tc6rsdfs7aaskpjgv5rzw7tbr.py ADDED Viewed

	@@ -0,0 +1,357 @@

+# AOT ID: ['25_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+aten = torch.ops.aten  # the assignments below bind short module-level aliases for torch operator namespaces and private torch._C helpers
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()  # handle whose .triton(...) method receives the kernel source string further down
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py
+# Topologically Sorted Source Nodes: [split, chunk, query_1, query_2, reshape, unbind, key_1, key_2, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.split_with_sizes, aten.split, aten.view, aten._fused_rms_norm, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add]
+# Source node to ATen node mapping:
+#   add => add_2
+#   add_1 => add_3
+#   chunk => split
+#   cos => unsqueeze, unsqueeze_1
+#   cos_2 => unsqueeze_6, unsqueeze_7
+#   float_1 => convert_element_type_4
+#   float_2 => convert_element_type_5
+#   float_3 => convert_element_type_7
+#   float_4 => convert_element_type_8
+#   key_1 => view_1
+#   key_2 => add_1, convert_element_type_2, convert_element_type_3, mean_1, mul_2, mul_3, pow_2, rsqrt_1
+#   mul => mul_4
+#   mul_1 => mul_5
+#   mul_2 => mul_6
+#   mul_3 => mul_7
+#   neg => neg
+#   neg_1 => neg_1
+#   out => convert_element_type_6
+#   out_1 => convert_element_type_9
+#   query_1 => view
+#   query_2 => add, convert_element_type, convert_element_type_1, mean, mul, mul_1, pow_1, rsqrt
+#   reshape => view_3
+#   reshape_1 => view_5
+#   sin => unsqueeze_2, unsqueeze_3
+#   sin_2 => unsqueeze_8, unsqueeze_9
+#   split => split_with_sizes
+#   stack => cat, unsqueeze_4, unsqueeze_5
+#   stack_1 => cat_1, unsqueeze_10, unsqueeze_11
+#   unbind => unbind
+#   unbind_1 => unbind_1
+#   x_rotated => view_4
+#   x_rotated_1 => view_6
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2304, 36864][84934656, 36864, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %buf0 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 73728]cuda:0" = PlaceHolder[target=buf0]
+#   %arg1_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %arg3_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg3_1]
+#   %cat : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0" = PlaceHolder[target=cat]
+#   %arg4_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg4_1]
+#   %buf1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 73728]cuda:0" = PlaceHolder[target=buf1]
+#   %arg2_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %cat_1 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0" = PlaceHolder[target=cat_1]
+#   %split_with_sizes : [num_users=2] = call_function[target=torch.ops.aten.split_with_sizes.default](args = (%arg0_1, [12288, 24576], -1), kwargs = {})
+#   %split : [num_users=3] = call_function[target=torch.ops.aten.split.Tensor](args = (%getitem, 4096, -1), kwargs = {})
+#   %view : Tensor "bf16[1, 2304, 32, 128][84934656, 36864, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%getitem_2, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.float32), kwargs = {})
+#   %pow_1 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 2), kwargs = {})
+#   %mean : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [3], True), kwargs = {})
+#   %add : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean, 1e-06), kwargs = {})
+#   %rsqrt : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {})
+#   %mul : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {})
+#   %mul_1 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul, %arg1_1), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {})
+#   %view_3 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_1, [1, 2304, 32, -1, 2]), kwargs = {})
+#   %unbind : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_3, -1), kwargs = {})
+#   %view_1 : Tensor "bf16[1, 2304, 32, 128][84934656, 36864, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%getitem_3, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_1, torch.float32), kwargs = {})
+#   %pow_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_2, 2), kwargs = {})
+#   %mean_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_2, [3], True), kwargs = {})
+#   %add_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean_1, 1e-06), kwargs = {})
+#   %rsqrt_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_1,), kwargs = {})
+#   %mul_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_2, %rsqrt_1), kwargs = {})
+#   %mul_3 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_2, %arg2_1), kwargs = {})
+#   %convert_element_type_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_3, torch.bfloat16), kwargs = {})
+#   %view_5 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_3, [1, 2304, 32, -1, 2]), kwargs = {})
+#   %unbind_1 : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_5, -1), kwargs = {})
+#   %convert_element_type_4 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_1, torch.float32), kwargs = {})
+#   %unsqueeze : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg3_1, 0), kwargs = {})
+#   %unsqueeze_1 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze, 2), kwargs = {})
+#   %mul_4 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %unsqueeze_1), kwargs = {})
+#   %neg : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_6,), kwargs = {})
+#   %unsqueeze_4 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg, 4), kwargs = {})
+#   %unsqueeze_5 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem_5, 4), kwargs = {})
+#   %cat : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_4, %unsqueeze_5], -1), kwargs = {})
+#   %view_4 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type_5 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_4, torch.float32), kwargs = {})
+#   %unsqueeze_2 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg4_1, 0), kwargs = {})
+#   %unsqueeze_3 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_2, 2), kwargs = {})
+#   %mul_5 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_5, %unsqueeze_3), kwargs = {})
+#   %add_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_4, %mul_5), kwargs = {})
+#   %convert_element_type_6 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_2, torch.bfloat16), kwargs = {})
+#   %convert_element_type_7 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_3, torch.float32), kwargs = {})
+#   %unsqueeze_6 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg3_1, 0), kwargs = {})
+#   %unsqueeze_7 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_6, 2), kwargs = {})
+#   %mul_6 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_7, %unsqueeze_7), kwargs = {})
+#   %neg_1 : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_8,), kwargs = {})
+#   %unsqueeze_10 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg_1, 4), kwargs = {})
+#   %unsqueeze_11 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem_7, 4), kwargs = {})
+#   %cat_1 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_10, %unsqueeze_11], -1), kwargs = {})
+#   %view_6 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_1, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type_8 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_6, torch.float32), kwargs = {})
+#   %unsqueeze_8 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg4_1, 0), kwargs = {})
+#   %unsqueeze_9 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_8, 2), kwargs = {})
+#   %mul_7 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_8, %unsqueeze_9), kwargs = {})
+#   %add_3 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_6, %mul_7), kwargs = {})
+#   %convert_element_type_9 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_3, torch.bfloat16), kwargs = {})
+#   return %buf1,%buf0,%cat,%convert_element_type_6,%cat_1,%convert_element_type_9
+triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 = async_compile.triton('triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', '''
+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.reduction(
+    size_hints={'x': 131072, 'r0_': 128},
+    reduction_hint=ReductionHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_out_ptr1': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', 'mutated_arg_names': ['in_out_ptr0', 'in_out_ptr1'], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 16, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 0, 'r0_': 115606016}}
+)
+@triton.jit
+def triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(in_out_ptr0, in_out_ptr1, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 73728
+    r0_numel = 128
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = (xindex % 32)
+    x1 = xindex // 32
+    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    x5 = xindex
+    _tmp10 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_2 = r0_index
+        tmp0 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp6 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tmp1 * tmp1
+        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+        tmp5 = _tmp4 + tmp3
+        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)
+        tmp7 = tmp6.to(tl.float32)
+        tmp8 = tmp7 * tmp7
+        tmp9 = tl.broadcast_to(tmp8, [XBLOCK, R0_BLOCK])
+        tmp11 = _tmp10 + tmp9
+        _tmp10 = tl.where(r0_mask, tmp11, _tmp10)
+    tmp4 = tl.sum(_tmp4, 1)[:, None]
+    tmp10 = tl.sum(_tmp10, 1)[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_3 = (r0_index % 2)
+        r0_4 = r0_index // 2
+        r0_2 = r0_index
+        tmp50 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp58 = tl.load(in_ptr1 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp63 = tl.load(in_ptr2 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0)
+        tmp66 = tl.load(in_ptr3 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0)
+        tmp96 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp102 = tl.load(in_ptr4 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = r0_3
+        tmp13 = tl.full([1, 1], 0, tl.int64)
+        tmp14 = tmp12 >= tmp13
+        tmp15 = tl.full([1, 1], 1, tl.int64)
+        tmp16 = tmp12 < tmp15
+        tmp17 = tl.load(in_ptr0 + (1 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp18 = tmp17.to(tl.float32)
+        tmp19 = 128.0
+        tmp20 = (tmp10 / tmp19)
+        tmp21 = 1e-06
+        tmp22 = tmp20 + tmp21
+        tmp23 = libdevice.rsqrt(tmp22)
+        tmp24 = tmp18 * tmp23
+        tmp25 = tl.load(in_ptr1 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp26 = tmp25.to(tl.float32)
+        tmp27 = tmp24 * tmp26
+        tmp28 = tmp27.to(tl.float32)
+        tmp29 = -tmp28
+        tmp30 = tl.full(tmp29.shape, 0.0, tmp29.dtype)
+        tmp31 = tl.where(tmp16, tmp29, tmp30)
+        tmp32 = tmp12 >= tmp15
+        tmp33 = tl.full([1, 1], 2, tl.int64)
+        tmp34 = tmp12 < tmp33
+        tmp35 = tl.load(in_ptr0 + (2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp36 = tmp35.to(tl.float32)
+        tmp37 = 128.0
+        tmp38 = (tmp10 / tmp37)
+        tmp39 = 1e-06
+        tmp40 = tmp38 + tmp39
+        tmp41 = libdevice.rsqrt(tmp40)
+        tmp42 = tmp36 * tmp41
+        tmp43 = tl.load(in_ptr1 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp44 = tmp43.to(tl.float32)
+        tmp45 = tmp42 * tmp44
+        tmp46 = tmp45.to(tl.float32)
+        tmp47 = tl.full(tmp46.shape, 0.0, tmp46.dtype)
+        tmp48 = tl.where(tmp32, tmp46, tmp47)
+        tmp49 = tl.where(tmp16, tmp31, tmp48)
+        tmp51 = tmp50.to(tl.float32)
+        tmp52 = 128.0
+        tmp53 = (tmp10 / tmp52)
+        tmp54 = 1e-06
+        tmp55 = tmp53 + tmp54
+        tmp56 = libdevice.rsqrt(tmp55)
+        tmp57 = tmp51 * tmp56
+        tmp59 = tmp58.to(tl.float32)
+        tmp60 = tmp57 * tmp59
+        tmp61 = tmp60.to(tl.float32)
+        tmp62 = tmp61.to(tl.float32)
+        tmp64 = tmp62 * tmp63
+        tmp65 = tmp49.to(tl.float32)
+        tmp67 = tmp65 * tmp66
+        tmp68 = tmp64 + tmp67
+        tmp69 = tmp68.to(tl.float32)
+        tmp70 = tl.load(in_ptr0 + (4097 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp71 = tmp70.to(tl.float32)
+        tmp72 = (tmp4 / tmp19)
+        tmp73 = tmp72 + tmp21
+        tmp74 = libdevice.rsqrt(tmp73)
+        tmp75 = tmp71 * tmp74
+        tmp76 = tl.load(in_ptr4 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp77 = tmp76.to(tl.float32)
+        tmp78 = tmp75 * tmp77
+        tmp79 = tmp78.to(tl.float32)
+        tmp80 = -tmp79
+        tmp81 = tl.full(tmp80.shape, 0.0, tmp80.dtype)
+        tmp82 = tl.where(tmp16, tmp80, tmp81)
+        tmp83 = tl.load(in_ptr0 + (4096 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp84 = tmp83.to(tl.float32)
+        tmp85 = (tmp4 / tmp37)
+        tmp86 = tmp85 + tmp39
+        tmp87 = libdevice.rsqrt(tmp86)
+        tmp88 = tmp84 * tmp87
+        tmp89 = tl.load(in_ptr4 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp90 = tmp89.to(tl.float32)
+        tmp91 = tmp88 * tmp90
+        tmp92 = tmp91.to(tl.float32)
+        tmp93 = tl.full(tmp92.shape, 0.0, tmp92.dtype)
+        tmp94 = tl.where(tmp32, tmp92, tmp93)
+        tmp95 = tl.where(tmp16, tmp82, tmp94)
+        tmp97 = tmp96.to(tl.float32)
+        tmp98 = (tmp4 / tmp52)
+        tmp99 = tmp98 + tmp54
+        tmp100 = libdevice.rsqrt(tmp99)
+        tmp101 = tmp97 * tmp100
+        tmp103 = tmp102.to(tl.float32)
+        tmp104 = tmp101 * tmp103
+        tmp105 = tmp104.to(tl.float32)
+        tmp106 = tmp105.to(tl.float32)
+        tmp107 = tmp106 * tmp63
+        tmp108 = tmp95.to(tl.float32)
+        tmp109 = tmp108 * tmp66
+        tmp110 = tmp107 + tmp109
+        tmp111 = tmp110.to(tl.float32)
+        tl.store(in_out_ptr0 + (r0_2 + 128*x5), tmp69, r0_mask)
+        tl.store(in_out_ptr1 + (r0_2 + 128*x5), tmp111, r0_mask)
+''', device_str='cuda')  # the kernel source above is a string handed to async_compile.triton; it sums squares over 128 elements for the in_ptr0 slices at offsets 0 and 4096, then stores the scaled results to in_out_ptr0 and in_out_ptr1
+async_compile.wait(globals())  # wait on the async-compile handle, handing it this module's globals (NOTE(review): presumably blocks until the kernel source above is compiled — confirm)
+del async_compile  # remove the module-level name once the wait has returned
+class Runner:  # Wrapper whose `call` checks five inputs, launches one Triton kernel and returns four tensors; also stores a list of partition callables.
+    def __init__(self, partitions):  # partitions: stored unchanged on self.partitions (not read by `call`)
+        self.partitions = partitions
+    def recursively_apply_fns(self, fns):  # Replace self.partitions with fn(c) for each (fn, c) pair, matched by position.
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+    def call(self, args):  # args: mutable list of exactly five inputs (emptied here); returns a 4-tuple of tensors.
+        arg0_1, arg1_1, arg2_1, arg3_1, arg4_1 = args
+        args.clear()  # empties the caller's list (NOTE(review): presumably so inputs can be released once the `del`s below run — confirm)
+        assert_size_stride(arg0_1, (1, 2304, 36864), (84934656, 36864, 1))  # expected (size, stride) per input; 84934656 == 2304 * 36864
+        assert_size_stride(arg1_1, (128, ), (1, ))
+        assert_size_stride(arg2_1, (128, ), (1, ))
+        assert_size_stride(arg3_1, (2304, 128), (128, 1))
+        assert_size_stride(arg4_1, (2304, 128), (128, 1))
+        with torch.cuda._DeviceGuard(0):  # everything below targets CUDA device index 0
+            torch.cuda.set_device(0)
+            buf2 = empty_strided_cuda((1, 2304, 32, 64, 2), (9437184, 4096, 128, 2, 1), torch.bfloat16)  # first output buffer, allocated as (..., 64, 2)
+            buf3 = reinterpret_tensor(buf2, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf2  # reuse: buf3 views buf2 with the trailing (64, 2) dims flattened to 128
+            buf4 = empty_strided_cuda((1, 2304, 32, 64, 2), (9437184, 4096, 128, 2, 1), torch.bfloat16)  # second output buffer, same layout as buf2
+            buf5 = reinterpret_tensor(buf4, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf4  # reuse: same flattening as buf3
+            # Topologically Sorted Source Nodes: [split, chunk, query_1, query_2, reshape, unbind, key_1, key_2, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.split_with_sizes, aten.split, aten.view, aten._fused_rms_norm, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add]
+            stream0 = get_raw_stream(0)
+            triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.run(buf3, buf5, arg0_1, arg1_1, arg3_1, arg4_1, arg2_1, 73728, 128, stream=stream0)  # note arg2_1 is passed last among the tensors; 73728 == 2304 * 32 (NOTE(review): presumably xnumel and r0_numel — confirm against the kernel signature)
+            del arg1_1  # drop local references to inputs that are not part of the return value
+            del arg2_1
+            del arg3_1
+            del arg4_1
+        return (buf3, buf5, reinterpret_tensor(arg0_1, (1, 2304, 32, 128), (84934656, 36864, 128, 1), 8192), reinterpret_tensor(arg0_1, (1, 2304, 24576), (84934656, 36864, 1), 12288), )  # last two items are views of arg0_1 at offsets 8192 and 12288 (8192 + 32*128 == 12288; 12288 + 24576 == 36864)
+runner = Runner(partitions=[])  # single module-level instance, created with an empty partition list
+call = runner.call  # module-level name bound to that instance's `call` method
+recursively_apply_fns = runner.recursively_apply_fns  # module-level name bound to the same instance's method
+def benchmark_compiled_module(times=10, repeat=10):  # Build random inputs and time `call` through print_performance; `times`/`repeat` are forwarded to it and its result is returned.
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2304, 36864), (84934656, 36864, 1), device='cuda:0', dtype=torch.bfloat16)  # sizes/strides mirror the assert_size_stride checks in Runner.call
+    arg1_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+    arg3_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32)  # arg3_1 and arg4_1 are float32, unlike the bfloat16 inputs above
+    arg4_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1])  # a fresh list per invocation, because `call` clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+if __name__ == "__main__":  # runs only when this file is executed directly
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object

torchinductor/6k/abd9e26dfce6bf628201c09f1f90f4340fdaab3cc2dd99f7186afe82fe013d1a.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 35, "triton_cache_hash": "Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q"}

torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.pointwise(
+    size_hints={'x': 1048576},
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 8396800}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # Elementwise kernel: out_ptr0[i] = in_ptr0[i] + in_ptr1[i % 4096] * in_ptr2[i]
+    xnumel = 1048576  # the xnumel argument is overwritten with this fixed element count
+    xoffset = tl.program_id(0) * XBLOCK  # each program handles one XBLOCK-sized slice of the flat index range
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True and not passed anywhere: every load/store below uses None as its mask
+    x2 = xindex  # flat element index
+    x0 = (xindex % 4096)  # index into in_ptr1, which is re-read with period 4096
+    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)  # inputs are converted to float32 before the arithmetic
+    tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32)
+    tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp0 + tmp3
+    tl.store(out_ptr0 + (x2), tmp4, None)  # out_ptr0 is declared '*bf16' in triton_meta above

torchinductor/6w/4fb0f9adeff50e9452e8fd238a1808052c095c59a0b2f1d9f3f7d7106bd1ede5.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 68, "triton_cache_hash": "6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA"}

torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.pointwise(
+    size_hints={'x': 33554432},
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 201326592}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # Elementwise kernel: for each 24576-wide input row, out = (a * sigmoid(a)) * b with a = first 12288 entries, b = last 12288 entries
+    xnumel = 25165824  # the xnumel argument is overwritten with this fixed element count (25165824 == 2048 * 12288)
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True and not passed anywhere: every load/store below uses None as its mask
+    x0 = (xindex % 12288)  # column within the 12288-wide output row
+    x1 = xindex // 12288  # output row index
+    x2 = xindex  # flat output index
+    tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32)  # first half of input row x1 (input rows are 24576 elements apart)
+    tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32)  # second half of the same input row
+    tmp1 = tmp0.to(tl.float32)  # tmp0 is already float32 after the load's .to()
+    tmp2 = tl.sigmoid(tmp1)
+    tmp3 = tmp1 * tmp2  # x * sigmoid(x), i.e. the "silu" in the kernel name
+    tmp4 = tmp3.to(tl.float32)
+    tmp6 = tmp4 * tmp5
+    tl.store(out_ptr0 + (x2), tmp6, None)  # out_ptr0 is declared '*bf16' in triton_meta above

torchinductor/7f/be95397d0c18f43f4314e0cac66d456d9d3e2b12116963a4bf988016e97f7a5e.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 45, "triton_cache_hash": "EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ"}

torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.pointwise(
+    size_hints={'x': 8388608},
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 67117056}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_add_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # Elementwise kernel: out_ptr0[i] = in_ptr0[i] + in_ptr1[i % 4096] * in_ptr2[i]
+    xnumel = 8388608  # the xnumel argument is overwritten with this fixed element count
+    xoffset = tl.program_id(0) * XBLOCK  # each program handles one XBLOCK-sized slice of the flat index range
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True and not passed anywhere: every load/store below uses None as its mask
+    x2 = xindex  # flat element index
+    x0 = (xindex % 4096)  # index into in_ptr1, which is re-read with period 4096
+    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)  # inputs are converted to float32 before the arithmetic
+    tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32)
+    tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp0 + tmp3
+    tl.store(out_ptr0 + (x2), tmp4, None)  # out_ptr0 is declared '*bf16' in triton_meta above

torchinductor/a3/94dc88253134d772dc28ed260760d9a0059b054d472700be3c22dd06b228f22f.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 1, "R0_BLOCK": 2048, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 35, "triton_cache_hash": "H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ"}

torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.reduction(
+    size_hints={'x': 256, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 12607488}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):  # Two-pass row kernel: pass 1 writes y = in0 + in1 * in2 to out_ptr0 and accumulates row statistics of y; pass 2 writes ((y - mean) * rsqrt(m2 / 4096 + 1e-06)) * (in3 + 1.0) + in4 to out_ptr3
+    xnumel = 256  # the xnumel / r0_numel arguments are overwritten with fixed sizes: 256 rows of 4096 elements
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]  # row indices as a column vector
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]  # within-row offsets as a row vector
+    rbase = r0_base
+    x0 = xindex
+    tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)  # running accumulators (named mean / m2 / weight) fed to welford_reduce below
+    tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):  # pass 1: walk each row in R0_BLOCK-wide chunks
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # indexed by column only, so the same values are used for every row
+        tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp3 = tmp1 * tmp2
+        tmp4 = tmp0 + tmp3
+        tmp5 = tmp4.to(tl.float32)
+        tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK])
+        tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce(  # NOTE(review): last argument `roffset == 0` presumably flags the first chunk — confirm against triton_helpers
+            tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0
+        )
+        tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean)  # masked-out lanes keep their previous accumulator values
+        tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2)
+        tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight)
+        tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask)  # y is written out here and read back in pass 2
+    tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1)  # combine the per-lane accumulators (NOTE(review): the trailing 1 is presumably the reduction axis — confirm)
+    tmp7 = tmp8[:, None]  # per-row mean, used below
+    tmp11 = tmp9[:, None]  # per-row m2, divided by 4096.0 below
+    tmp12 = tmp10[:, None]  # not read again in this kernel
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):  # pass 2: normalize and apply the scale/shift inputs
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)  # reads back the y values stored by pass 1
+        tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp14 = tmp13.to(tl.float32)
+        tmp15 = tmp14 - tmp7
+        tmp16 = 4096.0  # equals r0_numel, the row length
+        tmp17 = (tmp11 / tmp16)
+        tmp18 = 1e-06  # constant added before rsqrt
+        tmp19 = tmp17 + tmp18
+        tmp20 = libdevice.rsqrt(tmp19)
+        tmp21 = tmp15 * tmp20
+        tmp22 = tmp21.to(tl.float32)
+        tmp24 = 1.0
+        tmp25 = tmp23 + tmp24  # multiplier is (in_ptr3 + 1.0)
+        tmp26 = tmp22 * tmp25
+        tmp28 = tmp26 + tmp27  # in_ptr4 is added after the multiplication
+        tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask)

torchinductor/aotautograd/a27rkqg32yfaub3aygtms2gl3oet2qxfcnp4zxa3zy5h6c3risxz/aw5eda3h36wpnnltujgkb4mvobznersd4fuvo2p7vy2quujasos ADDED Viewed

Binary file (52.3 kB). View file

torchinductor/aotautograd/a3443o3ywoehrda4trn5q47mauudwcinftvd52hitdnfmakyhqc4/lw6yvpbd45y77sg6fh5v4otbinchkwbf7b56u3rh3wgq3x2wkhq ADDED Viewed

Binary file (54.7 kB). View file

torchinductor/aotautograd/a3554ihbxq57jan4ib74iqo5mnaqevqume4yzewukzkm6ehpsilz/eubahghkef62rmchvnle5v6h3ddip4av5qqjxomdlm7ura45qve ADDED Viewed

Binary file (54.9 kB). View file

torchinductor/aotautograd/a3hojixb5fzn7f7jfco3ddoohdsuggk4qbop3lcg7rjy3e7fkgfz/o7wvolbgborwtoofbovayor23y4ubooymfcvv6jeqm2wbx3n2cs ADDED Viewed

Binary file (74.3 kB). View file

torchinductor/aotautograd/a54twb2qknddjxnxtmkoagy3umo5y3ptsesm2pdhy7nkefklf6wx/emxzj524wmpvifsxw4dsnnkzemqpzfgkenbo5obwmksvlhsr354 ADDED Viewed

Binary file (54.7 kB). View file

torchinductor/aotautograd/a5ksywxhfabbequvxwstheyyj5w3sinuubxcrypqjwbqsyw5la3l/ew4fxjyfoflznyuws2w2ylu4p7owpjuqoshsef75w43w2vvwejd ADDED Viewed

Binary file (55.1 kB). View file

torchinductor/aotautograd/a7ptufzlocphh5n5o5u63gfzkf74tjb3l5is45u5hqjspv32qda6/an4kgppgf4vt5yfvvghrnmho6jc3qnj4l6c75zrsiotr5d4u5gv ADDED Viewed

Binary file (55.1 kB). View file

torchinductor/aotautograd/aal6kceyfi7eazavxzpgcec5hzt32bkwo7p4doeyc56ubzlwuvx4/nkoni3ckgbheucucq64bmrta4lhz7x237lalaqcrejvdc3supg4 ADDED Viewed

Binary file (55.1 kB). View file

torchinductor/aotautograd/aan5kpy6i54rnpeu5vlzbx6i6blimsvhducl7futzdjr4xciy472/a35s4usnkzmh6ybhedo3b6zehfepmwdv2gxscayjeeuucr3zat7 ADDED Viewed

Binary file (54.6 kB). View file

torchinductor/aotautograd/aesonb7djseswkbtu2qzhvg6ikd5rewxnqlt6pwuytadpxxmjcod/lap2sypphhofd6d5rhojruk2vfyvw2olc7gtulmom4i5y7ix2cp ADDED Viewed

Binary file (62.5 kB). View file

torchinductor/aotautograd/age65c4dyk2rxcqufpxd6bsafzao7tacrsvejbf3pjbsngnoashv/upzttal3jaj233iyzyps7mjpq75jt6qi6rzramvgyyewfg76h6s ADDED Viewed

Binary file (83.4 kB). View file

torchinductor/aotautograd/ahkpwjcp2qqyj6wu2ckjqlrit2pbb3ig3ddi75hgbkgngvvipwyq/ha76p7wv3nimmrgvx6kdiqikd6adbw7nlnaiars5ey4anx46mwn ADDED Viewed

Binary file (54.4 kB). View file

torchinductor/aotautograd/aiojzczi5txclvaydkrk5g3qlf33pdkkhxtefkhfphkpc3o6rr4p/w3n37k3qhqfhuewneurnairyblp3h7nrak6oyp2p3um7uwnfcz5 ADDED Viewed

Binary file (52 kB). View file

torchinductor/aotautograd/ajdkg3gacw25klanvqotc3mkab3mi23jtjpagxrosdmqv3d4yg7v/ejzrqbsrchqzxfppkzo4ep7edhv7lrjjbcdxkxvodbk4vvk3b62 ADDED Viewed

Binary file (52.7 kB). View file

torchinductor/aotautograd/amb262dx57ptj6gg2ch6skr372w6arsr3i7i4ed5pljhiycuxduw/fntav2w4z5lvr443jxseqalau2vuzp7x7ljd3hanoqubtutjkvp ADDED Viewed

Binary file (56.9 kB). View file

torchinductor/aotautograd/amjjivi2p6firai3idkjgfxyy6z4prevujsjdno2uuchwvd7xqll/enc6ruqcyggs4mnt54tjdd2lvexcvipd5vhhamxwcj77g5fpyof ADDED Viewed

Binary file (54.7 kB). View file

torchinductor/aotautograd/apfaqlwe555qd2zoz575w5mvoxoiasmcomkv76mhz5zvnm5jok66/epmli5r46rzrqf73pqrnb5tratdg3mbbwdf5vyzqr6ejyhnooye ADDED Viewed

Binary file (55.1 kB). View file

torchinductor/aotautograd/asjbg7f735jw54kcldmvv5uost22wzpy3hkxgaihos4rllvagheu/lwqpsnp52rszp2nlwkgi33embno5st2u5bxfm4rpyoy6fql5aor ADDED Viewed

Binary file (55.1 kB). View file

torchinductor/aotautograd/atc2ggqhejcse5aydwh2wjakijsc2dyhqjxwdqrwpra3mgjwe4st/xwy7lzraqocjillvk4s2yc2qhpkx43s2nbkxmeb2wpph3sgyc7n ADDED Viewed

Binary file (56.1 kB). View file

torchinductor/aotautograd/atsevoi6zqdcnehuxassvjosi3j5vrk54uisibylfgspeewp6vyx/4sfzv7d6ch2yoi6nnr5ym3i6yibku3vfveyrr6sx6dqbmavxo32 ADDED Viewed

Binary file (55.1 kB). View file

torchinductor/aotautograd/ax7bbwqbruobasu7vagn2oj2owh5vgosxbjelta324rvf4tkesd4/ipnutob47ydixp2zetluyw4apg7fe5sfkkiianwaawh6yq3uang ADDED Viewed

Binary file (52.8 kB). View file

torchinductor/aotautograd/ay26zyuzpll2prvy7zzoeydo7r47lrr6s6jcmzi2zmytjxzebmnz/nzx7lukg3r25p6sjlwtqmkf6gmgzuq7iwagwki2x4kvhw5ducr5 ADDED Viewed

Binary file (59.5 kB). View file

torchinductor/aotautograd/ay65riayezoo7bqggl72pzrzdi6lvy5mp23ajx4f453ylzpmve3s/p7clvcke3bsgsaumutstrxc7bkq4tq6yoia7nwigana3n3unini ADDED Viewed

Binary file (56.5 kB). View file

torchinductor/aotautograd/azyih32olvhzuay5zpfypzhk2cdlosvaqxdhcnjzlwfs6k3a2ne6/5sz2kjdze7ixdny7hz24p4uma7uup7chdcpiumqznifqn4mpmqb ADDED Viewed

Binary file (62.7 kB). View file

torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.reduction(
+    size_hints={'x': 256, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 6307840}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_1(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):  # Two-pass row kernel: pass 1 accumulates row statistics of in_ptr0; pass 2 writes (in1 + 1.0) * ((x - mean) * rsqrt(m2 / 4096 + 1e-06)) + in2 to out_ptr2
+    xnumel = 256  # the xnumel / r0_numel arguments are overwritten with fixed sizes: 256 rows of 4096 elements
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]  # row indices as a column vector
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]  # within-row offsets as a row vector
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)  # running accumulators (named mean / m2 / weight) fed to welford_reduce below
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):  # pass 1: walk each row in R0_BLOCK-wide chunks
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # 'evict_last' here; the same addresses are loaded again in pass 2
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(  # NOTE(review): last argument `roffset == 0` presumably flags the first chunk — confirm against triton_helpers
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)  # masked-out lanes keep their previous accumulator values
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)  # combine the per-lane accumulators (NOTE(review): the trailing 1 is presumably the reduction axis — confirm)
+    tmp3 = tmp4[:, None]  # per-row mean, used below
+    tmp7 = tmp5[:, None]  # per-row m2, divided by 4096.0 below
+    tmp8 = tmp6[:, None]  # not read again in this kernel
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):  # pass 2: normalize and apply the scale/shift inputs
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # indexed by column only, so the same values are used for every row
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)  # second read of the input row
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10  # multiplier is (in_ptr1 + 1.0)
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0  # equals r0_numel, the row length
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06  # constant added before rsqrt
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23  # in_ptr2 is added after the multiplication
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)

torchinductor/av/d186a24d3c8af5514b42dea48fc981efd3f5afb7bba6c30406e42c75862888b1.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 1, "R0_BLOCK": 4096, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 33, "triton_cache_hash": "CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA"}

torchinductor/ay/cayicsdjyjxzpcmkvjbneubnqkuhs3y37qiwy5qlel3z2loa4qav.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# AOT ID: ['1_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+aten = torch.ops.aten  # short module-level aliases for torch.ops namespaces
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride  # helpers from torch._C._dynamo.guards bound to short names
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+async_compile.wait(globals())  # no kernel source is handed to async_compile between its creation and this wait in this file
+del async_compile  # remove the module-level name once the wait has returned
+class Runner:  # Wrapper whose `call` checks two inputs and returns re-laid-out views of them (no kernel launch); also stores a list of partition callables.
+    def __init__(self, partitions):  # partitions: stored unchanged on self.partitions (not read by `call`)
+        self.partitions = partitions
+    def recursively_apply_fns(self, fns):  # Replace self.partitions with fn(c) for each (fn, c) pair, matched by position.
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+    def call(self, args):  # args: mutable list of exactly two inputs (emptied here); returns a 2-tuple.
+        arg0_1, arg1_1 = args
+        args.clear()  # empties the caller's list
+        assert_size_stride(arg0_1, (4096, 12288), (1, 4096))  # stride 1 on the first dim: consecutive memory runs along dim 0
+        assert_size_stride(arg1_1, (1, 1), (1, 1))
+        return (aten.view.dtype(reinterpret_tensor(arg0_1, (12288, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), )  # arg0_1 re-described as (12288, 4096) with strides (4096, 1) then viewed as uint8; arg1_1 re-described as shape (1,)
+runner = Runner(partitions=[])  # single module-level instance, created with an empty partition list
+call = runner.call  # module-level name bound to that instance's `call` method
+recursively_apply_fns = runner.recursively_apply_fns  # module-level name bound to the same instance's method
+def benchmark_compiled_module(times=10, repeat=10):  # Build random inputs and time `call` through print_performance; `times`/`repeat` are forwarded to it and its result is returned.
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((4096, 12288), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn)  # sizes/strides mirror the assert_size_stride checks in Runner.call
+    arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1])  # a fresh list per invocation, because `call` clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+if __name__ == "__main__":  # runs only when this file is executed directly
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object

torchinductor/bv/7969eba2eb589b95d2894ee75ee67ba01cd2bee09cd64d315c70c0950888c19e.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 2, "R0_BLOCK": 128, "num_warps": 2, "num_stages": 1, "configs_hash": "6ffa43f2ca8cb1499f3ff3fbf8c975f2c07eef9b57fcecda113029ab12cbef66", "found_by_coordesc": false, "time_taken_ms": 307, "triton_cache_hash": "AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A"}

torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.reduction(  # generated reduction kernel: per-row mean-square normalization fused with an even/odd pair swap and a two-term combine (see body)
+    size_hints={'x': 131072, 'r0_': 128},
+    reduction_hint=ReductionHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_out_ptr1': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', 'mutated_arg_names': ['in_out_ptr0', 'in_out_ptr1'], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 16, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 0, 'r0_': 115606016}}
+)
+@triton.jit
+def triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(in_out_ptr0, in_out_ptr1, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):  # in_out_ptr0/in_out_ptr1 are only stored to in this body; in_ptr0..in_ptr4 are only loaded
+    xnumel = 73728  # overwrites the xnumel argument with a fixed row count
+    r0_numel = 128  # overwrites the r0_numel argument; length of the reduced axis
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)  # all-True and not passed to any load/store below; only r0_mask is applied
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = (xindex % 32)  # row index splits as (x1, x0) with x0 in [0, 32)
+    x1 = xindex // 32
+    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)  # running sum of squares for the in_ptr0 slice at offset +4096
+    x5 = xindex
+    _tmp10 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)  # running sum of squares for the in_ptr0 slice at offset +0
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):  # pass 1: accumulate squared values along the r0 axis for both slices
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_2 = r0_index
+        tmp0 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # row base 128*x0 + 36864*x1, shifted by 4096 (= 32 rows of 128)
+        tmp6 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # same row base, no shift
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tmp1 * tmp1
+        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+        tmp5 = _tmp4 + tmp3
+        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)  # lanes with r0_index >= r0_numel leave the accumulator unchanged
+        tmp7 = tmp6.to(tl.float32)
+        tmp8 = tmp7 * tmp7
+        tmp9 = tl.broadcast_to(tmp8, [XBLOCK, R0_BLOCK])
+        tmp11 = _tmp10 + tmp9
+        _tmp10 = tl.where(r0_mask, tmp11, _tmp10)
+    tmp4 = tl.sum(_tmp4, 1)[:, None]  # per-row sum of squares for the +4096 slice, shape [XBLOCK, 1]
+    tmp10 = tl.sum(_tmp10, 1)[:, None]  # per-row sum of squares for the +0 slice, shape [XBLOCK, 1]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):  # pass 2: normalize, swap adjacent pairs, combine, store
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_3 = (r0_index % 2)  # position inside an adjacent (even, odd) pair
+        r0_4 = r0_index // 2  # pair index, so 2*r0_4 is the even member and 1 + 2*r0_4 the odd member
+        r0_2 = r0_index
+        tmp50 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp58 = tl.load(in_ptr1 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # per-r0 weight applied to the +0 slice
+        tmp63 = tl.load(in_ptr2 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0)  # NOTE(review): indexed by (x1, r0); presumably the cos table of a rotary embedding - confirm
+        tmp66 = tl.load(in_ptr3 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0)  # NOTE(review): indexed by (x1, r0); presumably the matching sin table - confirm
+        tmp96 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp102 = tl.load(in_ptr4 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)  # per-r0 weight applied to the +4096 slice
+        tmp12 = r0_3
+        tmp13 = tl.full([1, 1], 0, tl.int64)
+        tmp14 = tmp12 >= tmp13
+        tmp15 = tl.full([1, 1], 1, tl.int64)
+        tmp16 = tmp12 < tmp15  # true on even r0
+        tmp17 = tl.load(in_ptr0 + (1 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)  # even lanes read their odd partner (r0 + 1)
+        tmp18 = tmp17.to(tl.float32)
+        tmp19 = 128.0
+        tmp20 = (tmp10 / tmp19)  # mean of squares: row sum divided by 128
+        tmp21 = 1e-06
+        tmp22 = tmp20 + tmp21
+        tmp23 = libdevice.rsqrt(tmp22)  # 1 / sqrt(mean of squares + 1e-06)
+        tmp24 = tmp18 * tmp23
+        tmp25 = tl.load(in_ptr1 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp26 = tmp25.to(tl.float32)
+        tmp27 = tmp24 * tmp26
+        tmp28 = tmp27.to(tl.float32)
+        tmp29 = -tmp28  # even lanes take the negated, normalized, weighted partner value
+        tmp30 = tl.full(tmp29.shape, 0.0, tmp29.dtype)
+        tmp31 = tl.where(tmp16, tmp29, tmp30)
+        tmp32 = tmp12 >= tmp15  # true on odd r0
+        tmp33 = tl.full([1, 1], 2, tl.int64)
+        tmp34 = tmp12 < tmp33
+        tmp35 = tl.load(in_ptr0 + (2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)  # odd lanes read their even partner (r0 - 1)
+        tmp36 = tmp35.to(tl.float32)
+        tmp37 = 128.0
+        tmp38 = (tmp10 / tmp37)
+        tmp39 = 1e-06
+        tmp40 = tmp38 + tmp39
+        tmp41 = libdevice.rsqrt(tmp40)
+        tmp42 = tmp36 * tmp41
+        tmp43 = tl.load(in_ptr1 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp44 = tmp43.to(tl.float32)
+        tmp45 = tmp42 * tmp44
+        tmp46 = tmp45.to(tl.float32)
+        tmp47 = tl.full(tmp46.shape, 0.0, tmp46.dtype)
+        tmp48 = tl.where(tmp32, tmp46, tmp47)
+        tmp49 = tl.where(tmp16, tmp31, tmp48)  # pair swap for the +0 slice: even r0 -> -y[r0 + 1], odd r0 -> y[r0 - 1], y being the normalized, weighted value
+        tmp51 = tmp50.to(tl.float32)
+        tmp52 = 128.0
+        tmp53 = (tmp10 / tmp52)
+        tmp54 = 1e-06
+        tmp55 = tmp53 + tmp54
+        tmp56 = libdevice.rsqrt(tmp55)
+        tmp57 = tmp51 * tmp56
+        tmp59 = tmp58.to(tl.float32)
+        tmp60 = tmp57 * tmp59  # y[r0]: normalized value times the in_ptr1 weight at the same r0
+        tmp61 = tmp60.to(tl.float32)
+        tmp62 = tmp61.to(tl.float32)
+        tmp64 = tmp62 * tmp63
+        tmp65 = tmp49.to(tl.float32)
+        tmp67 = tmp65 * tmp66
+        tmp68 = tmp64 + tmp67  # y * in_ptr2 value + swapped(y) * in_ptr3 value
+        tmp69 = tmp68.to(tl.float32)
+        tmp70 = tl.load(in_ptr0 + (4097 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)  # from here on: the same steps for the +4096 slice, using tmp4 and the in_ptr4 weight
+        tmp71 = tmp70.to(tl.float32)
+        tmp72 = (tmp4 / tmp19)
+        tmp73 = tmp72 + tmp21
+        tmp74 = libdevice.rsqrt(tmp73)
+        tmp75 = tmp71 * tmp74
+        tmp76 = tl.load(in_ptr4 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp77 = tmp76.to(tl.float32)
+        tmp78 = tmp75 * tmp77
+        tmp79 = tmp78.to(tl.float32)
+        tmp80 = -tmp79
+        tmp81 = tl.full(tmp80.shape, 0.0, tmp80.dtype)
+        tmp82 = tl.where(tmp16, tmp80, tmp81)
+        tmp83 = tl.load(in_ptr0 + (4096 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp84 = tmp83.to(tl.float32)
+        tmp85 = (tmp4 / tmp37)
+        tmp86 = tmp85 + tmp39
+        tmp87 = libdevice.rsqrt(tmp86)
+        tmp88 = tmp84 * tmp87
+        tmp89 = tl.load(in_ptr4 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp90 = tmp89.to(tl.float32)
+        tmp91 = tmp88 * tmp90
+        tmp92 = tmp91.to(tl.float32)
+        tmp93 = tl.full(tmp92.shape, 0.0, tmp92.dtype)
+        tmp94 = tl.where(tmp32, tmp92, tmp93)
+        tmp95 = tl.where(tmp16, tmp82, tmp94)  # pair swap for the +4096 slice
+        tmp97 = tmp96.to(tl.float32)
+        tmp98 = (tmp4 / tmp52)
+        tmp99 = tmp98 + tmp54
+        tmp100 = libdevice.rsqrt(tmp99)
+        tmp101 = tmp97 * tmp100
+        tmp103 = tmp102.to(tl.float32)
+        tmp104 = tmp101 * tmp103
+        tmp105 = tmp104.to(tl.float32)
+        tmp106 = tmp105.to(tl.float32)
+        tmp107 = tmp106 * tmp63  # reuses the same in_ptr2/in_ptr3 values as the +0 slice
+        tmp108 = tmp95.to(tl.float32)
+        tmp109 = tmp108 * tmp66
+        tmp110 = tmp107 + tmp109
+        tmp111 = tmp110.to(tl.float32)
+        tl.store(in_out_ptr0 + (r0_2 + 128*x5), tmp69, r0_mask)  # result for the +0 slice; output rows are packed 128 elements per xindex
+        tl.store(in_out_ptr1 + (r0_2 + 128*x5), tmp111, r0_mask)  # result for the +4096 slice, same packed layout

torchinductor/cr/ccr2gijy4jp6vvdbewmzgaogxbf5as7ytxtou4zo2yelawomrjjg.py ADDED Viewed

	@@ -0,0 +1,131 @@

+# AOT ID: ['21_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+aten = torch.ops.aten  # short module-level aliases for op namespaces
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride  # used by Runner.call to check the input tensor's size and stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda  # used by Runner.call to allocate the output buffer
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()  # receives the Triton kernel source below; deleted after async_compile.wait(globals())
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py
+# Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul]
+# Source node to ATen node mapping:
+#   chunk => split
+#   silu => convert_element_type, convert_element_type_1, mul_6, sigmoid
+#   x => mul_10
+# Graph fragment:
+#   %arg1_1 : Tensor "bf16[1, s67, 24576][24576*s67, 24576, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %split : [num_users=2] = call_function[target=torch.ops.aten.split.Tensor](args = (%arg1_1, 12288, -1), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%getitem, torch.float32), kwargs = {})
+#   %sigmoid : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type,), kwargs = {})
+#   %mul_6 : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %sigmoid), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_6, torch.bfloat16), kwargs = {})
+#   %mul_10 : Tensor "bf16[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %getitem_1), kwargs = {})
+#   return %mul_10
+triton_poi_fused_mul_silu_split_0 = async_compile.triton('triton_poi_fused_mul_silu_split_0', '''
+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.pointwise(
+    size_hints={'x': 4194304},
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 12288)
+    x1 = xindex // 12288
+    x2 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32)
+    tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32)
+    tmp1 = tmp0.to(tl.float32)
+    tmp2 = tl.sigmoid(tmp1)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp3.to(tl.float32)
+    tmp6 = tmp4 * tmp5
+    tl.store(out_ptr0 + (x2), tmp6, None)
+''', device_str='cuda')  # the kernel is passed as source text: each 24576-wide input row is split into two 12288 halves and the output is first_half * sigmoid(first_half) * second_half
+async_compile.wait(globals())  # NOTE(review): presumably blocks until the kernel registered above is compiled - confirm against AsyncCompile.wait
+del async_compile  # the compile handle is not referenced again below
+class Runner:  # holds the list of graph partitions and exposes the compiled entry point call()
+    def __init__(self, partitions):  # partitions is stored as-is; this module constructs the instance with []
+        self.partitions = partitions
+    def recursively_apply_fns(self, fns):  # replaces each partition c with fn(c), pairing fns and partitions by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter sequence, so unpaired entries are dropped
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+    def call(self, args):  # args: mutable list [s67, input tensor]; returns a 1-tuple holding the output buffer
+        arg0_1, arg1_1 = args
+        args.clear()  # empties the caller's list so it no longer holds references to the inputs
+        s67 = arg0_1  # runtime length of the input's middle dimension
+        assert_size_stride(arg1_1, (1, s67, 24576), (24576*s67, 24576, 1))  # expected size (1, s67, 24576) with row-major strides
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, s67, 12288), (12288*s67, 12288, 1), torch.bfloat16)  # output: last axis is half the input's 24576
+            # Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul]
+            triton_poi_fused_mul_silu_split_0_xnumel = 12288*s67  # total number of output elements, passed as the kernel's xnumel
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_mul_silu_split_0.run(arg1_1, buf0, triton_poi_fused_mul_silu_split_0_xnumel, stream=stream0)
+            del arg1_1  # drops the local reference to the input after the kernel launch call
+        return (buf0, )
+runner = Runner(partitions=[])  # module-level instance; starts with no partitions
+call = runner.call  # bound methods exposed under module-level names
+recursively_apply_fns = runner.recursively_apply_fns
+def benchmark_compiled_module(times=10, repeat=10):  # runs call() on an input built by rand_strided; times/repeat are forwarded to print_performance
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = 256  # example value for s67; matches the 256 in the tensor shape on the next line
+    arg1_1 = rand_strided((1, 256, 24576), (6291456, 24576, 1), device='cuda:0', dtype=torch.bfloat16)  # 6291456 = 24576 * 256, the stride call() asserts for s67 = 256
+    fn = lambda: call([arg0_1, arg1_1])  # builds a fresh list on every invocation because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+if __name__ == "__main__":  # script entry point: hand the benchmark function to compiled_module_main
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object

torchinductor/cz/bb6645c6be31f426023ec47eef09e354ad9fa8b2d59e6e45ab49b803eb34d44e.best_config ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 41, "triton_cache_hash": "SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA"}

torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+@triton_heuristics.pointwise(  # generated pointwise kernel: copies in_ptr0 to out_ptr0 element for element
+    size_hints={'x': 8388608},
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 50331648}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_clone_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 8388608  # overwrites the xnumel argument with a fixed element count
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True and not passed to the load/store below, which both use mask None
+    x0 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)  # same flat offset is used for the load and the store
+    tl.store(out_ptr0 + (x0), tmp0, None)