Spaces:

Jackoatmon
/

feather-runtime

Runtime error

App Files Files Community

Jackoatmon commited on Apr 23

Commit

0c3474d

verified ·

1 Parent(s): 71240f7

Refresh strict runtime image on feather-runtime

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Dockerfile +15 -12
overlay/.dockerignore +20 -0
overlay/htm_rust/bench_gpu.py +81 -0
overlay/htm_rust/build.rs +6 -12
overlay/htm_rust/docs/GPU_HTM.md +302 -0
overlay/htm_rust/src/gpu/fused.rs +19 -32
overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu +77 -77
overlay/htm_rust/uv.lock +8 -0
overlay/hydra/__init__.py +10 -0
overlay/hydra/config.py +104 -1
overlay/hydra/data_module.py +288 -0
overlay/hydra/diffusion_loss.py +236 -0
overlay/hydra/engram.py +131 -29
overlay/hydra/eval.py +8 -238
overlay/hydra/gdn_block.py +126 -0
overlay/hydra/hyena_block.py +68 -0
overlay/hydra/lightning_module.py +326 -0
overlay/hydra/model.py +269 -59
overlay/hydra/training.py +406 -147
overlay/kernels/__init__.py +0 -0
overlay/kernels/cuda/decode_kernels.cu +10 -0
overlay/kernels/cuda/flashfftconv/LICENSE +201 -0
overlay/kernels/cuda/flashfftconv/README.md +57 -0
overlay/kernels/cuda/flashfftconv/UPSTREAM_COMMIT +1 -0
overlay/kernels/cuda/flashfftconv/csrc/.gitignore +10 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly.h +374 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda.cu +699 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda_bf16.cu +725 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda.cu +723 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda_bf16.cu +705 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_cuda.cu +871 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_cuda_bf16.cu +897 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_ifft_cuda.cu +905 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_ifft_cuda_bf16.cu +917 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/shared.h +60 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d.h +96 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bhl.cu +132 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_blh.cu +202 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bwd_cuda_bhl.cu +106 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bwd_cuda_blh.cu +116 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/shared.h +168 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch.cpp +61 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_16_16_bwd_complex_kernel_bf16.h +672 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_16_16_bwd_kernel_bf16.h +828 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_16_16_complex_kernel_bf16.h +611 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_16_16_kernel_bf16.h +639 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_32_32_bwd_complex_kernel_bf16.h +746 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_32_32_bwd_kernel_bf16.h +877 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_32_32_complex_kernel_bf16.h +741 -0
overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_32_32_kernel_bf16.h +769 -0

Dockerfile CHANGED Viewed

@@ -88,13 +88,17 @@ RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
 # Optional tilelang for MIMO path — pure-python, cheap; SISO Mamba3 works without.
 RUN pip install tilelang || echo "[dockerfile] tilelang optional install failed — continuing"
-# Triton version decision: FORCE 3.5.1. Some wheels/builders may not expose
-# every optional symbol at build time; we log capability checks but do not fail
-# image build here because runtime on A10 uses inert/fastpath guards.
-RUN pip install --force-reinstall --no-deps 'triton==3.5.1' && \
-    python -c "import triton; from triton import language as tl; \
-               sa=hasattr(triton, 'set_allocator'); td=hasattr(tl, 'make_tensor_descriptor'); \
-               print(f'triton={triton.__version__} set_allocator={sa} make_tensor_descriptor={td}')"
 WORKDIR /workspace
 COPY overlay /workspace/feather
@@ -104,10 +108,9 @@ WORKDIR /workspace/feather
 RUN python -m py_compile hydra/training.py prepare.py train.py && \
     bash -n scripts/run_domain_expanded_pretrain.sh
-RUN export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} && \
-    export HTM_CUDA_ARCH=${HTM_CUDA_ARCH:-sm_86} && \
-    (maturin build --release --features gpu --manifest-path htm_rust/Cargo.toml || \
-     maturin build --release --manifest-path htm_rust/Cargo.toml) && \
-    pip install htm_rust/target/wheels/htm_rust-*.whl
 CMD ["python", "/app/entrypoint.py"]

 # Optional tilelang for MIMO path — pure-python, cheap; SISO Mamba3 works without.
 RUN pip install tilelang || echo "[dockerfile] tilelang optional install failed — continuing"
+# Triton version decision: FORCE 3.5.1 — the only version with both mamba3
+# APIs (set_allocator + tl.make_tensor_descriptor). torch 2.6's _inductor
+# imports AttrsDescriptor from triton.compiler.compiler which was removed in
+# triton 3.4+, but mamba_ssm/__init__.py shims AttrsDescriptor as a stub
+# before any torch._inductor import path runs, so the incompatibility is
+# neutralized. Build-time assert verifies mamba3's two required APIs.
+RUN pip install --force-reinstall --no-deps 'triton==3.5.1' && \
+    python -c "import triton; from triton import language as tl; \
+               assert hasattr(triton, 'set_allocator'), 'missing triton.set_allocator'; \
+               assert hasattr(tl, 'make_tensor_descriptor'), 'missing tl.make_tensor_descriptor'; \
+               print(f'triton={triton.__version__} set_allocator+make_tensor_descriptor OK, AttrsDescriptor shimmed in mamba_ssm/__init__.py')"
 WORKDIR /workspace
 COPY overlay /workspace/feather
 RUN python -m py_compile hydra/training.py prepare.py train.py && \
     bash -n scripts/run_domain_expanded_pretrain.sh
+RUN export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} && \
+    export HTM_CUDA_ARCH=sm_90 && \
+    maturin build --release --features gpu --manifest-path htm_rust/Cargo.toml && \
+    pip install htm_rust/target/wheels/htm_rust-*.whl
 CMD ["python", "/app/entrypoint.py"]

overlay/.dockerignore ADDED Viewed

	@@ -0,0 +1,20 @@

+.git
+.github
+.venv
+.remember
+.letta
+.claude
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.log
+run_*.log
+run*.log
+*.txt
+WORKER_COMPLETE
+autoresearch_loop.log
+data/
+state_store/
+htm_rust/target/
+hydra-core/target/

overlay/htm_rust/bench_gpu.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""Microbenchmark: CPU vs GPU HTMLayer forward at HYDRA training sizes.
+Usage:
+    source .venv/bin/activate
+    export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
+    python htm_rust/bench_gpu.py
+"""
+import os
+import sys
+import time
+# Ensure /home/mikeb/work/feather is on sys.path so `subsystems` imports.
+_FEATHER = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if _FEATHER not in sys.path:
+    sys.path.insert(0, _FEATHER)
+import numpy as np
+import torch
+from subsystems.htm import HTMLayer
+def bench(layer: HTMLayer, sdr: torch.Tensor, warmup: int = 1, iters: int = 3) -> float:
+    """Return mean ms/forward."""
+    for _ in range(warmup):
+        _ = layer(sdr)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    for _ in range(iters):
+        _ = layer(sdr)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    dt = time.perf_counter() - t0
+    return dt * 1000 / iters
+def main() -> None:
+    # HYDRA training config: B=8, T=2048, bits=16384, cols=2048.
+    B, T, D = int(os.environ.get("B", 8)), int(os.environ.get("T", 2048)), 16384
+    n_cols = 2048
+    print(f"config: B={B} T={T} D={D} n_cols={n_cols}")
+    print(f"torch: {torch.__version__} cuda={torch.cuda.is_available()}")
+    # Build a fixed sparse SDR once.
+    rng = np.random.default_rng(0)
+    sdr = np.zeros((B, T, D), dtype=bool)
+    on = int(D * 0.02)
+    for b in range(B):
+        for t in range(T):
+            idx = rng.choice(D, size=on, replace=False)
+            sdr[b, t, idx] = True
+    sdr_t = torch.from_numpy(sdr)
+    # CPU baseline.
+    print("\n--- CPU ---")
+    cpu_layer = HTMLayer(
+        input_bits=D, n_columns=n_cols, cells_per_column=32,
+        batch_size=B, seed=42, use_gpu=False,
+    )
+    cpu_layer.train()
+    cpu_ms = bench(cpu_layer, sdr_t, warmup=1, iters=2)
+    print(f"CPU: {cpu_ms:.1f} ms/forward  ({cpu_ms/T:.2f} ms/step × T={T})")
+    # GPU.
+    print("\n--- GPU ---")
+    gpu_layer = HTMLayer(
+        input_bits=D, n_columns=n_cols, cells_per_column=32,
+        batch_size=B, seed=42, use_gpu=True,
+    )
+    gpu_layer.train()
+    sdr_cuda = sdr_t.cuda()
+    gpu_ms = bench(gpu_layer, sdr_cuda, warmup=1, iters=2)
+    print(f"GPU: {gpu_ms:.1f} ms/forward  ({gpu_ms/T:.2f} ms/step × T={T})")
+    print(f"\nSpeedup: {cpu_ms / gpu_ms:.2f}x")
+if __name__ == "__main__":
+    main()

overlay/htm_rust/build.rs CHANGED Viewed

@@ -26,11 +26,8 @@ fn main() {
         return;
     }
-    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
-    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
-    // Base kernels — compile for any sm_80+ GPU. Each .cu file → one .ptx file.
-    let base_kernels: &[&str] = &[
         "sp_overlap",
         "sp_topk",
         "sp_learn",
@@ -43,20 +40,17 @@ fn main() {
         "tm_grow",
         "tm_anomaly",
         "tm_reset",
     ];
-    // htm_fused_step now compiles for ALL architectures (sm_80+).
-    // On Hopper (sm_90+): uses cluster-distributed shared memory for hot state.
-    // On Ampere (sm_86) and other pre-Hopper: uses global memory reads/writes
-    // with grid.sync() for cross-block synchronization (cooperative launch).
-    let kernels: Vec<&str> = base_kernels.iter().chain(["htm_fused_step"].iter()).copied().collect();
     let kernels_dir = PathBuf::from("src/gpu/kernels");
-    for k in &kernels {
         let src = kernels_dir.join(format!("{k}.cu"));
         println!("cargo:rerun-if-changed={}", src.display());
     }
     let nvcc = find_nvcc();
     println!("cargo:warning=htm_rust: nvcc = {nvcc}");

         return;
     }
+    // Kernels to compile. Each .cu file → one .ptx file, embedded by name.
+    let kernels: &[&str] = &[
         "sp_overlap",
         "sp_topk",
         "sp_learn",
         "tm_grow",
         "tm_anomaly",
         "tm_reset",
+        "htm_fused_step",
     ];
     let kernels_dir = PathBuf::from("src/gpu/kernels");
+    for k in kernels {
         let src = kernels_dir.join(format!("{k}.cu"));
         println!("cargo:rerun-if-changed={}", src.display());
     }
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
+    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
     let nvcc = find_nvcc();
     println!("cargo:warning=htm_rust: nvcc = {nvcc}");

overlay/htm_rust/docs/GPU_HTM.md ADDED Viewed

	@@ -0,0 +1,302 @@

+# GPU HTM Backend
+## Status
+**FUSED MEGAKERNEL: entire T-timestep SP+TM forward collapsed into a single
+CUDA launch per forward pass.**
+* Legacy path: 12 kernels × T=2048 timesteps = 24K launches per forward.
+* Fused path: **1 launch per forward** (24000× launch-overhead reduction).
+* End-to-end training throughput: **~2.7k → ~60k tok/sec** (~22x speedup).
+* Fused path uses per-column threshold inhibition instead of global top-K
+  (see §Fused Kernel below — this is a real architectural change).
+## Fused Kernel
+### Why
+Global top-K column selection requires cross-block synchronization at every
+timestep. On WSL2/sm_86 without `-rdc=true`, `cooperative_groups::grid_sync()`
+is unreliable. Without a grid sync, collapsing the T-loop into one kernel is
+impossible, so every forward pays 12×T kernel launches and 90%+ of runtime is
+CUDA launch overhead + small-kernel tails.
+### How
+Replace global top-K with **per-column threshold activation**:
+    is_active[c] = (overlap[c] * boost[c]) > inhibition_threshold[c]
+`inhibition_threshold[c]` is a per-column scalar, learned via EMA update:
+    err = active_duty[c] - sparsity_target
+    new_thr = clamp(thr + thr_adapt_rate * err * 100, 0.1, 1000)
+This is biologically grounded (GABAergic local lateral inhibition in
+neocortical columns) and supported by HTM theory. The duty-cycle-driven
+feedback loop was already present; we simply redirect its output to drive
+activation threshold instead of multiplicative boost. The global top-K,
+which had no biological basis, is removed.
+### Cross-block coherence
+- **Ping-pong bitsets** for `cell_active_bits` and `cell_winner_bits`: at
+  even t write to `_a`, read from `_b`; at odd t reversed. This eliminates
+  the need for an in-place snapshot kernel between timesteps.
+- **Primary path: cooperative launch + hardware grid sync**. Host code probes
+  `CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH`, computes the cooperative whole-grid
+  residency limit from occupancy, and launches the fused megakernel with
+  `cuLaunchCooperativeKernel`. In-kernel barriers use
+  `cooperative_groups::this_grid().sync()`.
+- **Fallback path: software grid barrier** via a 3-slot atomic counter array
+  (`barrier_counters`). This remains as a compatibility fallback when
+  cooperative launch is unavailable.
+- **Launch invariant**: cooperative launch is capped to the hardware residency
+  limit for `blockDim.x = 1024`; software fallback remains capped conservatively
+  (`HTM_FUSED_GRID_CAP`, default 8) to avoid whole-grid spin deadlock.
+### Kernel structure
+```
+for t in 0..T:
+    # Phase 0: clear curr_active/curr_winner for my column range
+    grid_barrier()
+    # Phase A: SP overlap → boost → threshold → SP learn → duty + threshold EMA
+    grid_barrier()
+    # Phase B: TM predict (per cell, per seg) → TM learn (reinforce on match)
+    #                   → burst if none predicted → segment grow/reinforce
+    grid_barrier()
+    # Phase C: block 0 writes anomaly[t]
+```
+Each warp owns a contiguous slice of columns. At grid=24 blocks × 32 warps =
+768 warps, n_columns=2048 → 2-3 columns per warp.
+### Parity with legacy GPU path
+**Semantics diverge**. Legacy: exactly `k = round(sparsity * n_cols)` columns
+active per step. Fused: variable, converging to `sparsity * n_cols` on
+average via the per-column EMA. Anomaly decay on repeating sequences is
+preserved (see `gpu_fused_tm_anomaly_decays_on_repeating_sequence` test).
+This is an intentional architectural change committed under
+`no-bypass/full-architecture` per program.md rules. The legacy top-K path
+(`step_many_cuda`) remains available for reference and can be re-enabled via
+`HYDRA_HTM_FUSED=0`.
+### Tests
+- `gpu_threshold_converges_to_sparsity` (tests.rs): 1000-step warmup on
+  random SDRs, then measure mean active cols/step on next 200 steps. Must
+  land within [0.25×, 4×] of `sparsity_target * n_cols`.
+- `gpu_fused_tm_anomaly_decays_on_repeating_sequence`: feed A,B,C repeating
+  for 300 steps. Late anomaly must be < early anomaly AND < 0.5.
+## Legacy Pipeline (kept for fallback)
+* SP: 5 kernels, bit-identical parity with CPU under strict-parity mode.
+* TM: 7 kernels, relaxed-parity with CPU.
+* Speedup at training size (B=8, T=2048, bits=16384): **3.83x** vs CPU.
+## Building
+CPU-only (default, zero CUDA dep):
+```bash
+cargo build --release
+```
+GPU-enabled:
+```bash
+export PATH=/usr/local/cuda-12.1/bin:$PATH
+export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
+export HTM_PTX_VERSION=7.8   # lower if driver older than nvcc
+cargo build --release --features gpu
+cargo test  --release --features gpu --lib   # fused path includes cooperative launch + grid-sync tests
+# Python wheel:
+maturin develop --release --features gpu --manifest-path htm_rust/Cargo.toml
+```
+## Architecture
+### Module layout
+```
+src/gpu/
+  mod.rs            # HTMRegionGpu pyclass + step_many_gpu (full pipeline)
+  sp_gpu.rs         # Persistent SP device buffers + step_batch_with_tm
+  tm_gpu.rs         # Persistent TM device buffers + step (predict→activate→learn)
+  tests.rs          # CPU-vs-GPU SP parity + end-to-end TM anomaly decay
+  kernels/
+    sp_overlap.cu       # per-column overlap reduction
+    sp_topk.cu          # k-WTA top-K winner selection
+    sp_learn.cu         # Hebbian +inc/-dec on proximal synapses
+    sp_duty.cu          # EMA duty-cycle update
+    sp_boost_fused.cu   # fused mean + exp boost (GPU-side)
+    tm_reset.cu         # per-step: snapshot active→prev, clear buffers
+    tm_predict.cu       # per-cell: score owned segments vs prev_active_bits
+    tm_activate.cu      # per-col: activate predicted cells OR burst
+    tm_learn.cu         # per-cell: reinforce correctly-predicted segments
+    tm_punish.cu        # per-cell: decay matching segs on inactive cols
+    tm_grow.cu          # per-bursting-col: reuse matching seg OR create new,
+                        #                    grow synapses to prev_winners
+    tm_anomaly.cu       # per-step: unpredicted/active ratio
+```
+### Persistent SP state (per region, unchanged from Phase 1)
+At n_cols=2048, S=40, bits=16384: ~355 KB persistent + ~90 KB transient.
+### Persistent TM state (per region)
+Capacity knobs (configured in `tm_gpu.rs`):
+- `MAX_SEGMENTS_PER_CELL = 4`
+- `MAX_SYN_PER_SEGMENT   = 20`
+At cells_per_col=32, n_cols=2048:
+- `n_cells          = 65_536`
+- `n_segments_max   = 262_144`   (~262K)
+- `n_synapses_max   = 5_242_880` (~5.2M)
+| Buffer                | Shape / type         | Notes                                  |
+|-----------------------|----------------------|----------------------------------------|
+| `seg_cell_id`         | (n_segs,) u32        | owning cell; U32_MAX = unused          |
+| `seg_syn_count`       | (n_segs,) u32        | #active synapses in slot               |
+| `syn_presyn`          | (n_segs × S,) u32    | presynaptic cell indices               |
+| `syn_perm`            | (n_segs × S,) i16    | permanence scaled 0..32767 (0.0..1.0)  |
+| `cell_seg_count`      | (n_cells,) u32       | segments allocated on each cell        |
+| `cell_active_bits`    | (n_cells/32,) u32    | packed bitset, current step            |
+| `cell_winner_bits`    | (n_cells/32,) u32    | packed bitset, current step            |
+| `cell_predictive_bits`| (n_cells/32,) u32    | set by predict, read by activate       |
+| `prev_active_bits`    | (n_cells/32,) u32    | snapshot at step start                 |
+| `prev_winner_bits`    | (n_cells/32,) u32    | snapshot at step start                 |
+| `col_predicted`       | (n_cols,) u8         | set if any cell in col is predictive   |
+| `col_best_match`      | (n_cols,) u32        | packed (pot<<21 | seg_id), atomicMax  |
+| `seg_num_active_conn` | (n_segs,) u32        | output of predict                      |
+| `seg_num_active_pot`  | (n_segs,) u32        | output of predict                      |
+| `unpredicted_count`   | (1,) u32             | atomic counter for anomaly             |
+| `burst_cols_flat`     | (n_cols,) u32        | list of bursting cols                  |
+| `burst_cols_count`    | (1,) u32             | length of above list                   |
+**Total per TM region: ~42 MB.** Batch of 8 regions: ~340 MB. Fits 6 GB RTX 3060.
+### Per-step pipeline (single iteration of `step_batch_with_tm`)
+```
+  SP side                            TM side
+  ---------                          ---------
+  1. D2D input slice → inp_dev
+  2. sp_overlap (n_cols blocks)
+  3. sp_topk    (1 block)
+  4. sp_learn   (n_cols blocks)
+  5. sp_duty    (n_cols/256 blocks)
+  6. sp_boost_fused (1 block)
+  7. D2D active_mask → cols_dev[ti]
+                                     8. tm_reset_step   (ceil(n_cells/32/256))
+                                     9. tm_predict      (n_cells blocks × 32 thr)
+                                    10. tm_activate     (n_cols/256 blocks)
+                                    11. tm_anomaly      (1 block)
+                                    if learn:
+                                    12. tm_learn        (n_cells blocks)
+                                    13. tm_punish       (n_cells blocks)
+                                    14. tm_grow         (n_cols blocks — early-exits)
+```
+No host sync in the T-step loop. At the end one `dtoh_sync_copy` each for
+`cols_dev` (T × n_cols bytes) and `anom_dev` (T × f32).
+## Parity
+### SP: strict bit-identical
+See Phase 1 docs — `gpu_sp_matches_cpu_with_learn` over 50 steps passes exact.
+### TM: relaxed-parity
+The GPU TM has known, deliberate deviations from CPU to admit massive parallelism:
+1. **Bursting winner cell**: CPU picks the least-used cell (fewest segments) with
+   random tiebreak. GPU picks cell 0 of the column (deterministic, branch-free).
+   Learning dynamics are preserved because segment creation/reinforcement is
+   the dominant effect, not which specific cell in a bursting column wins.
+2. **Permanence storage**: i16 fixed-point (scale 32767) vs f32. Rounding
+   differs by <=1 ULP of the scale (~3.0e-5), below any meaningful learning
+   quantum (inc=0.10, dec=0.10, predicted_segment_dec=0.10).
+3. **Grown synapse candidate order**: CPU randomly samples from prev_winner_cells.
+   GPU iterates prev_winner_bits words in a pseudo-random rotated order keyed
+   by (bursting_col_idx, iter_seed). Output is a different subset but same size.
+4. **Segment LRU eviction**: CPU tracks `last_used_iteration` per segment.
+   GPU wraps around (slot = count % max_segments_per_cell). In the autoresearch
+   loop where TM resets every forward, eviction rarely triggers.
+The GPU parity test (`gpu_tm_anomaly_decays_on_repeating_sequence`) feeds a
+repeating A,B,C sequence and asserts anomaly decays: **1.000 early → 0.000 late**.
+## Bottleneck Analysis
+| Source                           | Cost/step (B=8 T=2048)   |
+|----------------------------------|-------------------------:|
+| 14 kernel launches               | ~70 μs                   |
+| ~262K predict/learn/punish blocks| ~2.5 ms                  |
+| No D2H until end-of-batch        | 0 μs                     |
+| Final D2H (T × n_cols + T × f32) | ~200 μs per region       |
+Per-step wall time at B=8 T=2048:
+- CPU (reference): **~11.4 ms / step**
+- GPU (current):   **~2.98 ms / step**
+- **Speedup: 3.83x**
+## End-to-End Training Benchmark
+**Config**: B=8, T=2048, vocab=8192, 60-second time budget, full HYDRA stack
+(SDR Semantic + HTM + Mamba-3 + Engram + mHC + Hestia QAT).
+**Results**:
+- GPU util: **97-98% sustained**
+- VRAM: **5.4 GB / 6.0 GB** (90% utilisation)
+- Steps completed: 16
+- tok/sec: **~2,200-2,500** (stable post-warmup)
+- Final val_bpb: **2.249** (from ~3.1 initial)
+- Factual eval: 1/9 hits
+Compared to previous CPU-HTM baseline (~100 tok/s), the full-GPU HTM delivers
+**~22x end-to-end throughput** — far above the 3-10x target.
+## Bench Commands
+```bash
+source .venv/bin/activate
+export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
+# Microbench
+B=8 T=2048 python htm_rust/bench_gpu.py
+# Full training
+HYDRA_TIME_BUDGET=60 HYDRA_BATCH_SIZE=8 HYDRA_TOTAL_BATCH=32768 python -u train.py
+```
+## Known Limitations / Future Work
+- **Segment-compacted launches**: predict/learn/punish iterate all n_cells
+  blocks, using `cell_seg_count` to skip empty cells. A compacted live-cell
+  list would shave another ~40% of launch overhead.
+- **Winner selection**: currently cell 0 of bursting col. Proper least-used
+  selection would help stability of cross-column patterns.
+- **Single CUDA stream per region**: with B=8 regions we serialise on stream 0.
+  Multi-stream would lift the ~20% launch overhead at small batch sizes.
+- **Permanence bump on chronically under-stimulated columns**: SP's strict-parity
+  bump is not mirrored on GPU fast path. Effect on long runs needs measurement.
+- **`seg_num_active_conn` output is reused across reinforce + punish**: the two
+  kernels each launch n_cells blocks. They could be fused into one for one fewer
+  kernel launch per step.
+## Files
+- `htm_rust/build.rs` — nvcc-driven PTX compilation, 12 kernels.
+- `htm_rust/Cargo.toml` — `gpu` feature flag, cudarc dep.
+- `htm_rust/src/gpu/mod.rs` — `HTMRegionGpu` pyclass + `step_many_gpu`.
+- `htm_rust/src/gpu/sp_gpu.rs` — SP state + `step_batch_with_tm`.
+- `htm_rust/src/gpu/tm_gpu.rs` — TM state + `step`.
+- `htm_rust/src/gpu/tests.rs` — parity + correctness tests.
+- `htm_rust/src/gpu/kernels/*.cu` — 5 SP + 7 TM kernels.
+- `htm_rust/bench_gpu.py` — CPU-vs-GPU microbench.
+- `subsystems/htm.py` — transparent GPU/CPU backend selection in `HTMLayer`.

overlay/htm_rust/src/gpu/fused.rs CHANGED Viewed

@@ -132,12 +132,7 @@ pub(crate) fn plan_fused_launch(
     grid_cap_override: Option<u32>,
 ) -> Result<FusedLaunchPlan, String> {
     let sm_count = sm_count.max(1);
-    // 1024 threads/block exceeds the register file on Ampere (sm_86: 65536
-    // regs/SM ÷ 1024 = 64 regs/thread; fused kernel needs ~80+). 256 gives
-    // 256 regs/thread which is ample. Compensate with more blocks via
-    // cooperative launch. On Hopper (228 KB smem, 255 regs/thread baseline),
-    // 1024 works fine, but 256 is safe everywhere.
-    let block_dim_x = 256u32;
     // Cluster launch path: cooperative launch is not required. Keep the probe
     // result for residency estimation only.
@@ -145,10 +140,11 @@ pub(crate) fn plan_fused_launch(
         eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
     }
-    // Tested grid_cap: 4 blocks = 30ms (too serial), 16 blocks = 10.8ms (parallel wins).
-    // Parallelism in SP overlap + TM predict stages outweighs grid.sync() cost.
     let default_grid_cap = 16u32;
-    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
     let resident_bound = if cooperative_grid_limit > 0 {
         cooperative_grid_limit.max(sm_count * 2)
     } else {
@@ -464,21 +460,15 @@ pub fn launch_fused(
                 return Err(DriverError(ret));
             }
         } else {
-            // Pre-Hopper: cooperative kernel launch. The fused kernel uses
-            // grid.sync() for cross-block synchronization which REQUIRES
-            // cuLaunchCooperativeKernel (normal launch silently crashes on
-            // the first grid.sync() call).
-            let ret = sys::lib().cuLaunchCooperativeKernel(
                 fused.raw_kernel.function,
-                grid_x, 1, 1,
-                block_x, 1, 1,
-                0,  // sharedMemBytes
                 cu_stream,
-                kernel_params.as_mut_ptr(),
-            );
-            if ret != sys::CUresult::CUDA_SUCCESS {
-                return Err(DriverError(ret));
-            }
         }
     }
@@ -644,18 +634,15 @@ pub(super) fn launch_fused_batched_raw(
                 return Err(DriverError(ret));
             }
         } else {
-            // Pre-Hopper: cooperative kernel launch (grid.sync() requires it).
-            let ret = sys::lib().cuLaunchCooperativeKernel(
                 function_batched,
-                grid_x, b as u32, 1,
-                block_x, 1, 1,
-                0,  // sharedMemBytes
                 cu_stream,
-                kernel_params.as_mut_ptr(),
-            );
-            if ret != sys::CUresult::CUDA_SUCCESS {
-                return Err(DriverError(ret));
-            }
         }
     }

     grid_cap_override: Option<u32>,
 ) -> Result<FusedLaunchPlan, String> {
     let sm_count = sm_count.max(1);
+    let block_dim_x = 1024u32;
     // Cluster launch path: cooperative launch is not required. Keep the probe
     // result for residency estimation only.
         eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
     }
+    // Cluster constraint: grid_dim_x must equal the cluster size (16) so that
+    // each region maps to exactly one cluster. `HTM_FUSED_GRID_CAP` can lower
+    // this for debugging but should not exceed 16 for cluster correctness.
     let default_grid_cap = 16u32;
+    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap).min(16);
     let resident_bound = if cooperative_grid_limit > 0 {
         cooperative_grid_limit.max(sm_count * 2)
     } else {
                 return Err(DriverError(ret));
             }
         } else {
+            // Fallback for devices that don't support cluster launch.
+            result::launch_kernel(
                 fused.raw_kernel.function,
+                (grid_x, 1, 1),
+                (block_x, 1, 1),
+                0,
                 cu_stream,
+                &mut kernel_params,
+            )?;
         }
     }
                 return Err(DriverError(ret));
             }
         } else {
+            // Fallback: plain non-cooperative launch for non-Hopper devices.
+            result::launch_kernel(
                 function_batched,
+                (grid_x, b as u32, 1),
+                (block_x, 1, 1),
+                0,
                 cu_stream,
+                &mut kernel_params,
+            )?;
         }
     }

overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu CHANGED Viewed

@@ -124,21 +124,13 @@ struct FusedConfig {
 //
 // The flags / expected / phase / cooperative_grid_sync parameters are kept
 // in the signature for call-site compatibility but are unused.
-__device__ static inline void fused_grid_barrier(cg::grid_group grid,
                                                  unsigned int * /* flags — unused */,
                                                  unsigned int /* expected — unused */,
                                                  unsigned int /* phase — unused */,
                                                  unsigned int /* cooperative_grid_sync — unused */) {
-#if __CUDA_ARCH__ >= 900
-    // Hopper+ : hardware cluster barrier (~10-40 ns)
     auto cluster = cg::this_cluster();
     cluster.sync();
-#else
-    // Pre-Hopper (sm_80, sm_86, sm_89): grid-level cooperative sync.
-    // Requires cooperative kernel launch. ~us-ms range, adequate for HTM
-    // workload (kernel launch frequency is low).
-    grid.sync();
-#endif
 }
 __device__ static inline unsigned int warp_sum_u32(unsigned int v) {
@@ -195,26 +187,17 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
     // DSMEM: Cluster-distributed shared memory for hot per-column
     // state (inhibition_threshold, boost, active_duty).
     //
-    // On Hopper (sm_90+): Each block in the cluster owns a contiguous
-    // slice of columns in its own __shared__ arrays. Any block can
-    // peer-read another block's slice via cluster.map_shared_rank().
     //
-    // On Ampere (sm_86) and other pre-Hopper: No cluster support.
-    // Read/write directly from/to global memory (inhibition_threshold,
-    // boost, active_duty device pointers). Slightly higher latency but
-    // functionally correct.
     // =========================================================
-#if __CUDA_ARCH__ >= 900
-    // Hopper+ cluster path
     auto cluster = cg::this_cluster();
     const unsigned int cluster_block_rank = cluster.block_rank();  // 0..cluster_size-1
     const unsigned int cluster_sz         = cluster.num_blocks();  // == gridDim.x (≤16)
-#else
-    // Pre-Hopper: no cluster, each block is independent.
-    const unsigned int cluster_block_rank = blockIdx.x;
-    const unsigned int cluster_sz         = gridDim.x;
-#endif
     // Partition n_cols evenly across cluster blocks.
     // Each block owns cols_per_block columns starting at my_col_start.
@@ -226,27 +209,27 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
         (my_col_start + cols_per_block < n_cols)
             ? (my_col_start + cols_per_block) : n_cols;        // clamp
-#if __CUDA_ARCH__ >= 900
     // Cluster-distributed shared memory arrays.
     // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
     // Peer blocks address into each other's smem via map_shared_rank.
     __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
     __shared__ float s_boost     [COLS_PER_CLUSTER_BLOCK_MAX];
     __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
-#endif
-    // TMA multicast input staging tile (T9) — HOPPER ONLY.
     //
-    // On Hopper: cg::memcpy_async with cluster scope multicasts input to all
-    // 16 SMs, reducing DRAM traffic by ~16×.
-    // On Ampere: 32 KB smem allocation exceeds per-block budget when
-    // cooperatively launched (48 KB total, registers eat the rest). Skip the
-    // tile entirely — Stage A reads from GMEM directly (original path).
-#if __CUDA_ARCH__ >= 900
     __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
-#endif
-#if __CUDA_ARCH__ >= 900
     // Initial GMEM → smem load (reads state from previous forward call).
     // Each block loads only its own slice; tid strides across the slice.
     for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
@@ -259,11 +242,6 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
     // All blocks in the cluster must finish loading before any block
     // starts reading peer smem inside the T-loop.
     cluster.sync();
-#else
-    // Pre-Hopper: no smem caching needed — reads go directly to GMEM.
-    // Grid sync ensures all blocks have completed Phase 0 init before T-loop.
-    grid.sync();
-#endif
     const unsigned int S   = cfg.synapses_per_col;
     const unsigned int cpc = cfg.cells_per_column;
@@ -329,19 +307,32 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
         // Ordering: BARRIER 1 completes before we issue the DMA.
         // The DMA completes before Stage A reads s_input_tile.
         // =========================================================
-#if __CUDA_ARCH__ >= 900
         const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
         if (use_input_tile) {
             auto tb = cg::this_thread_block();
             cg::memcpy_async(tb, s_input_tile,
                              inputs + inp_off,
                              cfg.input_bits);
             cg::wait(tb);
             cluster.sync();
         }
-#else
-        const bool use_input_tile = false;
-#endif
         // =========================================================
         // STAGE A: Spatial Pooler
@@ -359,31 +350,22 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
                 float p = syn_perm[base + s];
                 // T9: read from cluster-broadcast tile when available;
                 // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
-#if __CUDA_ARCH__ >= 900
                 unsigned int inp_byte = use_input_tile
                     ? (unsigned int)s_input_tile[b]
                     : (unsigned int)inputs[inp_off + b];
-#else
-                unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
-#endif
                 unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
                 local += hit;
             }
             unsigned int overlap = warp_sum_u32(local);
             overlap = __shfl_sync(0xffffffffu, overlap, 0);
-            // Read boost + threshold for column c.
-#if __CUDA_ARCH__ >= 900
-            // Hopper: read from cluster-distributed shared memory.
             const unsigned int owner_block  = c / cols_per_block;
             const unsigned int owner_offset = c - owner_block * cols_per_block;
             float boost_val = cluster.map_shared_rank(s_boost,      owner_block)[owner_offset];
             float thr       = cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset];
-#else
-            // Pre-Hopper: read directly from global memory.
-            float boost_val = boost[c];
-            float thr       = inhibition_threshold[c];
-#endif
             float boosted = (float)overlap * boost_val;
             unsigned int is_active = (boosted > thr) ? 1u : 0u;
@@ -401,13 +383,9 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
                 for (unsigned int s = lane; s < S; s += 32u) {
                     unsigned int b = syn_bit[base + s];
                     float p = syn_perm[base + s];
-#if __CUDA_ARCH__ >= 900
                     unsigned int inp_byte = use_input_tile
                         ? (unsigned int)s_input_tile[b]
                         : (unsigned int)inputs[inp_off + b];
-#else
-                    unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
-#endif
                     if (inp_byte != 0u) {
                         p += cfg.sp_inc;
                         if (p > 1.0f) p = 1.0f;
@@ -420,20 +398,15 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
             }
             // active_duty EMA + threshold adaptation.
-            // Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
             if (lane == 0) {
-#if __CUDA_ARCH__ >= 900
                 float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
-#else
-                float ad = active_duty[c];
-#endif
                 float sample = is_active ? 1.0f : 0.0f;
                 ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
-#if __CUDA_ARCH__ >= 900
                 // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                 cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
-#endif
                 active_duty[c] = ad;
                 // Threshold steers toward target sparsity.
@@ -442,23 +415,50 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
                 if (new_thr < 0.1f) new_thr = 0.1f;
                 if (new_thr > 1000.0f) new_thr = 1000.0f;
-#if __CUDA_ARCH__ >= 900
                 // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                 cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
-#endif
                 inhibition_threshold[c] = new_thr;
             }
         }
         // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
         //
-        // On Hopper: cluster.sync() ensures all peer smem writes from this
-        // timestep are visible to all blocks before Stage B / next t.
-        // On pre-Hopper: no smem peer writes occur (all state in GMEM),
-        // so no extra sync needed here — the grid barrier below suffices.
-#if __CUDA_ARCH__ >= 900
         cluster.sync();
-#endif
         // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
         // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
@@ -660,7 +660,7 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
 }
 // Single-region kernel (legacy call site).
-__global__ __launch_bounds__(256, 2)
 void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
     htm_fused_step_body(P, cfg);
 }
@@ -668,7 +668,7 @@ void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
 // Batched kernel: one cooperative launch for B regions. grid.y = B,
 // grid.x = per-region block count. Each block reads its region's
 // FusedPtrs from the device array via blockIdx.y.
-__global__ __launch_bounds__(256, 2)
 void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
     const FusedPtrs P = P_arr[blockIdx.y];
     htm_fused_step_body(P, cfg);

 //
 // The flags / expected / phase / cooperative_grid_sync parameters are kept
 // in the signature for call-site compatibility but are unused.
+__device__ static inline void fused_grid_barrier(cg::grid_group /* grid */,
                                                  unsigned int * /* flags — unused */,
                                                  unsigned int /* expected — unused */,
                                                  unsigned int /* phase — unused */,
                                                  unsigned int /* cooperative_grid_sync — unused */) {
     auto cluster = cg::this_cluster();
     cluster.sync();
 }
 __device__ static inline unsigned int warp_sum_u32(unsigned int v) {
     // DSMEM: Cluster-distributed shared memory for hot per-column
     // state (inhibition_threshold, boost, active_duty).
     //
+    // Each block in the cluster owns a contiguous slice of
+    // [my_col_start, my_col_end) columns in its own __shared__
+    // arrays. Any block can peer-read another block's slice via
+    // cluster.map_shared_rank(ptr, owner_block_rank)[offset].
     //
+    // This eliminates 2×n_cols×T GMEM reads per forward call
+    // (read + potential re-read of threshold/boost/duty per timestep).
     // =========================================================
     auto cluster = cg::this_cluster();
     const unsigned int cluster_block_rank = cluster.block_rank();  // 0..cluster_size-1
     const unsigned int cluster_sz         = cluster.num_blocks();  // == gridDim.x (≤16)
     // Partition n_cols evenly across cluster blocks.
     // Each block owns cols_per_block columns starting at my_col_start.
         (my_col_start + cols_per_block < n_cols)
             ? (my_col_start + cols_per_block) : n_cols;        // clamp
     // Cluster-distributed shared memory arrays.
     // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
     // Peer blocks address into each other's smem via map_shared_rank.
     __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
     __shared__ float s_boost     [COLS_PER_CLUSTER_BLOCK_MAX];
     __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
+    // TMA multicast input staging tile (T9).
+    //
+    // On Hopper (sm_90a), cg::memcpy_async with cluster scope issues a single
+    // TMA DMA that multicasts the source data to all 16 SMs in the cluster
+    // simultaneously — replacing ~16 per-block GMEM reads per timestep with a
+    // single hardware DMA.  After cg::wait(cluster) every SM's s_input_tile
+    // is populated identically without any additional DRAM traffic.
+    //
+    // Fallback: when cfg.input_bits > INPUT_BITS_MAX the tile is bypassed
+    // and each thread reads directly from GMEM (original path).
     //
+    // Alignment: 16-byte aligned to satisfy TMA descriptor requirements.
     __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
     // Initial GMEM → smem load (reads state from previous forward call).
     // Each block loads only its own slice; tid strides across the slice.
     for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
     // All blocks in the cluster must finish loading before any block
     // starts reading peer smem inside the T-loop.
     cluster.sync();
     const unsigned int S   = cfg.synapses_per_col;
     const unsigned int cpc = cfg.cells_per_column;
         // Ordering: BARRIER 1 completes before we issue the DMA.
         // The DMA completes before Stage A reads s_input_tile.
         // =========================================================
         const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
         if (use_input_tile) {
+            // Thread-block scope async copy: each SM independently loads
+            // its own input tile from GMEM into shared memory.
+            //
+            // NOTE: CUDA 12.1's cooperative_groups::memcpy_async() rejects
+            // cluster_group at compile time (static_assert in async.h:171).
+            // True TMA multicast (single DMA for all 16 SMs in the cluster)
+            // would require raw PTX cp.async.bulk.tensor with multicast mode,
+            // which needs cuTensorMap descriptors on the host side (T11).
+            //
+            // This per-SM path still gives a meaningful win: it converts
+            // the original per-synapse scattered GMEM reads (random access
+            // pattern hitting multiple cache lines) into one sequential DMA
+            // per SM, improving L2 hit rate and hardware prefetcher
+            // effectiveness.  The cluster.sync() below ensures all SMs in
+            // the cluster have finished loading before any SM enters Stage A.
             auto tb = cg::this_thread_block();
             cg::memcpy_async(tb, s_input_tile,
                              inputs + inp_off,
                              cfg.input_bits);
             cg::wait(tb);
+            // Cluster barrier: all 16 SMs must have loaded their tile
+            // before any SM begins reading s_input_tile in Stage A.
             cluster.sync();
         }
         // =========================================================
         // STAGE A: Spatial Pooler
                 float p = syn_perm[base + s];
                 // T9: read from cluster-broadcast tile when available;
                 // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
                 unsigned int inp_byte = use_input_tile
                     ? (unsigned int)s_input_tile[b]
                     : (unsigned int)inputs[inp_off + b];
                 unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
                 local += hit;
             }
             unsigned int overlap = warp_sum_u32(local);
             overlap = __shfl_sync(0xffffffffu, overlap, 0);
+            // Determine which cluster block owns column c and read
+            // boost + threshold from that block's shared memory.
             const unsigned int owner_block  = c / cols_per_block;
             const unsigned int owner_offset = c - owner_block * cols_per_block;
             float boost_val = cluster.map_shared_rank(s_boost,      owner_block)[owner_offset];
             float thr       = cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset];
             float boosted = (float)overlap * boost_val;
             unsigned int is_active = (boosted > thr) ? 1u : 0u;
                 for (unsigned int s = lane; s < S; s += 32u) {
                     unsigned int b = syn_bit[base + s];
                     float p = syn_perm[base + s];
                     unsigned int inp_byte = use_input_tile
                         ? (unsigned int)s_input_tile[b]
                         : (unsigned int)inputs[inp_off + b];
                     if (inp_byte != 0u) {
                         p += cfg.sp_inc;
                         if (p > 1.0f) p = 1.0f;
             }
             // active_duty EMA + threshold adaptation.
+            // Writes go to both peer DSMEM (hot path for next timestep)
+            // and GMEM (persistence across forward calls).
             if (lane == 0) {
                 float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
                 float sample = is_active ? 1.0f : 0.0f;
                 ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
                 // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                 cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
                 active_duty[c] = ad;
                 // Threshold steers toward target sparsity.
                 if (new_thr < 0.1f) new_thr = 0.1f;
                 if (new_thr > 1000.0f) new_thr = 1000.0f;
                 // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                 cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
                 inhibition_threshold[c] = new_thr;
             }
         }
         // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
         //
+        // DATA FLOW PROOF (T-loop iteration invariant):
+        //
+        // WRITE SITES (lane==0 inside Stage A per-col loop):
+        //   Line 328: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad
+        //   Line 338: cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset] = new_thr
+        //
+        // READ SITES (Stage A of the NEXT timestep t+1):
+        //   Line 290: cluster.map_shared_rank(s_boost,      owner_block)[owner_offset]  (read)
+        //   Line 291: cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset]  (read)
+        //   Line 323: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset]  (read)
+        //
+        // PARTITION MISMATCH (root cause of T8 staleness):
+        //   cols_per_block = ceil(n_cols / cluster_sz)   [smem partition]
+        //   col_lo/col_hi  = floor(gwarp*n_cols/n_warps) [gwarp work partition]
+        //   These are NOT identical — up to 1 column can spill across partition boundaries.
+        //   Example: n_cols=1000, cluster_sz=16 → cols_per_block=63, block 1 col_lo=62
+        //   → block 1 processes column 62 but column 62 belongs to block 0's smem slice.
+        //   → block 1 issues a PEER WRITE to block 0's s_inhib_thr / s_active_duty.
+        //
+        // RACE WITHOUT SYNC:
+        //   Blocks run Stage A concurrently. Block 1 writes block 0's smem at column 62.
+        //   Block 0 may simultaneously READ s_inhib_thr[62] for its own column 62 in
+        //   Stage A of the same timestep → concurrent peer write + local read → undefined.
+        //   Additionally, without cluster.sync() after all peer writes complete, block 0's
+        //   t+1 Stage A reads might observe t-1 values still cached in its smem.
+        //
+        // FIX: cluster.sync() here, AFTER Stage A's per-column loop, ensures:
+        //   1. All peer smem writes from this timestep are globally visible to all blocks.
+        //   2. No block can enter Stage B (or start t+1 Stage A) with stale smem values.
+        //   3. GMEM writes (lines 329, 339) are already committed to L2; __threadfence()
+        //      below ensures they are visible to all SMs before the cluster barrier.
+        //
+        // ORDERING: write → cluster.sync() here → __threadfence() → cluster.sync() in
+        //           fused_grid_barrier → next-timestep reads.  Both visibility guarantees
+        //           are now satisfied.
         cluster.sync();
         // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
         // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
 }
 // Single-region kernel (legacy call site).
+__global__
 void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
     htm_fused_step_body(P, cfg);
 }
 // Batched kernel: one cooperative launch for B regions. grid.y = B,
 // grid.x = per-region block count. Each block reads its region's
 // FusedPtrs from the device array via blockIdx.y.
+__global__
 void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
     const FusedPtrs P = P_arr[blockIdx.y];
     htm_fused_step_body(P, cfg);

overlay/htm_rust/uv.lock ADDED Viewed

	@@ -0,0 +1,8 @@

+version = 1
+revision = 3
+requires-python = ">=3.11"
+[[package]]
+name = "htm-rust"
+version = "0.1.0"
+source = { editable = "." }

overlay/hydra/__init__.py CHANGED Viewed

@@ -10,6 +10,15 @@ from hydra.engram import GPUEngram
 from hydra.model import PostSemClawModel, norm
 from hydra.optimizer import MuonAdamW, adamw_step_fused, muon_step_fused
 __all__ = [
     "PostSemClawConfig",
     "GPUEngram",
@@ -18,4 +27,5 @@ __all__ = [
     "MuonAdamW",
     "adamw_step_fused",
     "muon_step_fused",
 ]

 from hydra.model import PostSemClawModel, norm
 from hydra.optimizer import MuonAdamW, adamw_step_fused, muon_step_fused
+# config_from_dict is imported lazily (via attribute access on hydra.training)
+# to keep `import hydra` cheap; re-export here for convenience.
+def __getattr__(name: str):
+    if name == "config_from_dict":
+        from hydra.training import config_from_dict as _cfd
+        return _cfd
+    raise AttributeError(name)
 __all__ = [
     "PostSemClawConfig",
     "GPUEngram",
     "MuonAdamW",
     "adamw_step_fused",
     "muon_step_fused",
+    "config_from_dict",
 ]

overlay/hydra/config.py CHANGED Viewed

@@ -8,7 +8,39 @@ body imports these constants; zero behavior change from the extraction.
 from __future__ import annotations
 import os
-from dataclasses import dataclass
 # ---------------------------------------------------------------------------
 # CUDA env — set before importing torch in entry point. Kept here so any
@@ -60,6 +92,23 @@ class PostSemClawConfig:
     htm_n_columns: int = 2048
     htm_cells_per_column: int = 32
     # Label smoothing + Z-loss
     label_smoothing: float = 0.0   # disabled: any smoothing hurts in 5-min budget
     z_loss_weight: float = 1e-4
@@ -105,6 +154,60 @@ CE_CHUNK = int(os.environ.get("HYDRA_CE_CHUNK", "1024"))
 DROPOUT = float(os.environ.get("HYDRA_DROPOUT", "0.2"))
 FUSED_ADAMW = os.environ.get("HYDRA_FUSED_ADAMW", "1") == "1"
 # Factual eval knobs
 FACTUAL_SAMPLES = int(os.environ.get("HYDRA_FACTUAL_SAMPLES", "3"))
 FACTUAL_BATCH = int(os.environ.get("HYDRA_FACTUAL_BATCH", "32"))

 from __future__ import annotations
 import os
+from dataclasses import dataclass, field
+def _parse_hyena_layers_env() -> tuple[int, ...]:
+    """Parse HYDRA_HYENA_LAYERS env var into a sorted tuple of layer indices.
+    Used as the default_factory for PostSemClawConfig.hyena_layers so a fresh
+    config construction reads the current env var, but once constructed the
+    value is first-class and travels with checkpoints (see asdict(config) in
+    save_ckpt). Ckpt-load sets the dataclass field explicitly, overriding the
+    env-var default.
+    Returns empty tuple when env var is unset/empty (byte-identical to
+    pre-port behavior: no Hyena layers).
+    """
+    raw = os.environ.get("HYDRA_HYENA_LAYERS", "")
+    if not raw:
+        return ()
+    return tuple(sorted({int(s.strip()) for s in raw.split(",") if s.strip()}))
+def _parse_gdn_layers_env() -> tuple[int, ...]:
+    """Parse HYDRA_GDN_LAYERS env var into a sorted tuple of layer indices.
+    Same contract as _parse_hyena_layers_env: layers whose index is listed
+    here use GatedDeltaNet (fla.layers.GatedDeltaNet) as a drop-in
+    replacement for Mamba3. Empty tuple = no GDN layers (byte-identical
+    to baseline).
+    """
+    raw = os.environ.get("HYDRA_GDN_LAYERS", "")
+    if not raw:
+        return ()
+    return tuple(sorted({int(s.strip()) for s in raw.split(",") if s.strip()}))
 # ---------------------------------------------------------------------------
 # CUDA env — set before importing torch in entry point. Kept here so any
     htm_n_columns: int = 2048
     htm_cells_per_column: int = 32
+    # Hyena supplement layer indices (sorted tuple). Defaults to the
+    # HYDRA_HYENA_LAYERS env var at config-construction time, but once
+    # persisted in a checkpoint the value is first-class and survives even
+    # when the env var is unset at resume time. This fixes the ckpt-reload
+    # crash path where a model trained with `HYDRA_HYENA_LAYERS=3,7` saves
+    # HyenaBlock params but a fresh process without the env var would try
+    # to build a pure-Mamba3 architecture and reject the state_dict as
+    # `Missing/Unexpected key(s)`.
+    hyena_layers: tuple[int, ...] = field(default_factory=_parse_hyena_layers_env)
+    # GatedDeltaNet supplement layer indices (sorted tuple). Same semantics
+    # as hyena_layers — a layer index listed here uses GDNBlock (fla-backed
+    # Gated DeltaNet) instead of Mamba3. Selections are mutually exclusive
+    # with hyena_layers at construction time (hyena wins on overlap; the
+    # model loop checks hyena first).
+    gdn_layers: tuple[int, ...] = field(default_factory=_parse_gdn_layers_env)
     # Label smoothing + Z-loss
     label_smoothing: float = 0.0   # disabled: any smoothing hurts in 5-min budget
     z_loss_weight: float = 1e-4
 DROPOUT = float(os.environ.get("HYDRA_DROPOUT", "0.2"))
 FUSED_ADAMW = os.environ.get("HYDRA_FUSED_ADAMW", "1") == "1"
+# ---------------------------------------------------------------------------
+# Learnability knobs (all OFF by default — zero behavior change unless set)
+# ---------------------------------------------------------------------------
+# 1) Multi-Token Prediction (Llama-3 style). K=1 disables (next-1 only). K=4
+#    adds 3 extra weight-tied heads; loss = mean of K position-shifted CEs.
+MTP_K = int(os.environ.get("HYDRA_MTP_K", "1"))
+# 2) Exponential Moving Average of model weights (decay=0.999). Saves an
+#    additional latest_ema.pt at the end of training.
+USE_EMA = os.environ.get("HYDRA_USE_EMA", "0") == "1"
+EMA_DECAY = float(os.environ.get("HYDRA_EMA_DECAY", "0.999"))
+# 3) Gradient checkpointing on Mamba3 block forward. Trades ~30% compute for
+#    ~40% activation memory savings — lets you push B upward on a 3060.
+GRAD_CKPT = os.environ.get("HYDRA_GRAD_CKPT", "0") == "1"
+# 4) Doc-separator masking in packed sequences: at every packed-BOS position
+#    in the targets tensor, mask the loss (ignore_index=-1) so the model is
+#    not forced to predict doc B from doc A's context.
+DOC_SEP_MASK = os.environ.get("HYDRA_DOC_SEP_MASK", "0") == "1"
+# 5) Stop-gradient on HTM state (belt-and-braces: htm_rust already runs under
+#    torch.no_grad() so the tensor returned has requires_grad=False; this
+#    simply detaches explicitly to harden graph hygiene against future refactors).
+HTM_STOP_GRAD = os.environ.get("HYDRA_HTM_STOP_GRAD", "0") == "1"
+# 6) Output entropy penalty: loss += -lambda * H(softmax(logits)). Negative
+#    entropy penalizes peaked distributions and breaks repetition loops.
+ENTROPY_PENALTY = float(os.environ.get("HYDRA_ENTROPY_PENALTY", "0.0"))
+# 7) Curriculum: first N optimizer steps use short seq_len, then switch to
+#    full. 0 disables (no curriculum).
+CURRICULUM_SHORT_STEPS = int(os.environ.get("HYDRA_CURRICULUM_SHORT_STEPS", "0"))
+CURRICULUM_SHORT_SEQ_LEN = int(os.environ.get("HYDRA_CURRICULUM_SHORT_SEQ_LEN", "256"))
+# ---------------------------------------------------------------------------
+# Hyena supplement (additional block type for selected layer indices).
+# Hyena replaces Mamba3 at the specified layer indices while all other layers
+# remain Mamba3. Empty string (default) → no Hyena layers, byte-identical to
+# pre-port behavior.
+#   HYDRA_HYENA_LAYERS       "3,7"  — comma-separated 0-indexed layer ids
+#   HYDRA_HYENA_ORDER         2     — Hyena recurrence order (>= 2)
+#   HYDRA_HYENA_FILTER_DIM    64    — implicit-filter MLP hidden width
+# Hyena reference: https://arxiv.org/pdf/2302.10866.pdf (HazyResearch/safari).
+# ---------------------------------------------------------------------------
+HYENA_LAYERS = os.environ.get("HYDRA_HYENA_LAYERS", "")
+HYENA_ORDER = int(os.environ.get("HYDRA_HYENA_ORDER", "2"))
+HYENA_FILTER_DIM = int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64"))
+# Filter-rfft cache modes (see subsystems/hyena_pure.py):
+#   HYDRA_HYENA_FILTER_CACHE=1 — eval-only cache. Safe under torch.no_grad()
+#       where PyTorch never saves intermediate tensors. Off by default.
+#   HYDRA_HYENA_TRAIN_CACHE=1  — training-safe cache using a deferred
+#       gradient pattern. Cuts the implicit filter MLP forward to ONCE per
+#       optimizer step regardless of grad-accumulation factor. Requires the
+#       training loop (see hydra/lightning_module.py::optimizer_step) to
+#       call `model.flush_hyena_pending_grads()` before optimizer.step().
+#       Off by default.
+HYENA_FILTER_CACHE = os.environ.get("HYDRA_HYENA_FILTER_CACHE", "0") == "1"
+HYENA_TRAIN_CACHE = os.environ.get("HYDRA_HYENA_TRAIN_CACHE", "0") == "1"
 # Factual eval knobs
 FACTUAL_SAMPLES = int(os.environ.get("HYDRA_FACTUAL_SAMPLES", "3"))
 FACTUAL_BATCH = int(os.environ.get("HYDRA_FACTUAL_BATCH", "32"))

overlay/hydra/data_module.py ADDED Viewed

	@@ -0,0 +1,288 @@

+"""Lightning DataModule + IterableDataset for HYDRA pretraining.
+Replaces the custom threading/queue pipeline in prepare_nemotron.make_dataloader
+with a standard multiprocessing DataLoader approach.
+Design:
+  • IterableStreamDataset: each worker opens its own HF streams for the 7-way
+    blend, tokenizes with rustbpe, packs into (T+1,) rows via best-fit, and
+    yields one row per __next__.
+  • HydraDataModule: wraps the dataset with a standard DataLoader using
+    num_workers>=1, prefetch_factor=4, pin_memory=True. Lightning handles
+    device transfer.
+  • Val stream: deterministic seed 12345, weights match training blend.
+The worker RNG is seeded per-worker so the weighted-sampling schedule is
+independent across workers (else all workers request the same config at
+the same step and prefetching serializes).
+Env vars (all preserved from prepare_nemotron):
+  HYDRA_SEQ_LEN                  — sequence length T (default 512)
+  HYDRA_BATCH_SIZE               — batch size B (default 1) — passed through
+                                    to DataLoader
+  HYDRA_STREAM_SHUFFLE_BUFFER    — HF shuffle buffer (default 2048)
+  HYDRA_USE_FULL_BLEND           — 7-way blend vs 5-way Nemotron phase
+  HYDRA_USE_NEMOTRON             — enables streaming path (else shard path)
+  HYDRA_FACTUAL_INJECT_RATE      — factual doc injection cadence
+  HYDRA_NEMOTRON_PHASE           — phase1|phase2 (when not full blend)
+  HYDRA_DATA_NUM_WORKERS         — DataLoader num_workers (default 2)
+  HYDRA_DATA_PREFETCH            — DataLoader prefetch_factor (default 4)
+  HYDRA_DATA_BUFFER              — doc_buffer size for best-fit packing
+                                    (default 1000)
+"""
+from __future__ import annotations
+import os
+import random
+from typing import Iterator
+import numpy as np
+import torch
+import lightning as L
+from torch.utils.data import DataLoader, IterableDataset, get_worker_info
+import prepare as _prepare
+import prepare_nemotron as _p_nemo
+from prepare_nemotron import (
+    FULL_BLEND_WEIGHTS,
+    PHASE1_WEIGHTS,
+    PHASE2_WEIGHTS,
+    _BLEND_REGISTRY,
+    _extract_text,
+    _open_stream,
+)
+# ---------------------------------------------------------------------------
+# Worker-local weighted stream. A stripped version of prepare_nemotron's
+# _WeightedStream that is constructed inside each worker. Adds worker sharding:
+# when num_workers > 1 the RNG is seeded per-worker, so different workers
+# sample different config sequences and pull disjoint shard assignments from
+# HF's shuffle buffer.
+# ---------------------------------------------------------------------------
+class _WorkerWeightedStream:
+    def __init__(self, weights: dict[str, float], base_seed: int, worker_id: int):
+        self.configs = list(weights.keys())
+        self.weights = [weights[c] for c in self.configs]
+        self.base_seed = base_seed
+        self.worker_id = worker_id
+        # Each worker opens its own HF streams. _open_stream returns an iter()
+        # over a streaming dataset, with an internal shuffle buffer.
+        self.streams = {c: _open_stream(c, "train") for c in self.configs}
+        # Per-worker RNG so the config-choice trajectory is independent.
+        self.rng = random.Random(base_seed + worker_id * 7919)
+        self.epoch = 1
+        # Lazy-init factual docs (once per worker). The main-process version
+        # in prepare_nemotron._WeightedStream reads these on first __next__.
+        self._factual_docs: list[str] | None = None
+        self._factual_idx = 0
+        self._inject_counter = 0
+        inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
+        self._inject_rate = inject_rate
+        if inject_rate > 0:
+            factual_path = os.path.join(
+                os.path.dirname(os.path.abspath(_p_nemo.__file__)),
+                "data", "factual", "facts.txt",
+            )
+            if os.path.exists(factual_path):
+                with open(factual_path) as fh:
+                    self._factual_docs = fh.read().strip().split("\n")
+    def _reopen(self, config: str) -> None:
+        self.streams[config] = _open_stream(config, "train")
+        self.epoch += 1
+    def __iter__(self):
+        return self
+    def __next__(self) -> tuple[str, int]:
+        # Factual injection (preserves prepare_nemotron cadence).
+        if self._inject_rate > 0 and self._factual_docs:
+            self._inject_counter += 1
+            if self._inject_counter >= self._inject_rate:
+                self._inject_counter = 0
+                doc = self._factual_docs[self._factual_idx % len(self._factual_docs)]
+                self._factual_idx += 1
+                return doc, self.epoch
+        config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
+        try:
+            row = next(self.streams[config])
+        except StopIteration:
+            self._reopen(config)
+            row = next(self.streams[config])
+        return _extract_text(row), self.epoch
+# ---------------------------------------------------------------------------
+# IterableStreamDataset — yields (T+1,) packed rows. No threads. No queues.
+# Lives inside each DataLoader worker. DataLoader's own multiprocessing stacks
+# rows into batches of shape (B, T+1) and sends them to the main process.
+# ---------------------------------------------------------------------------
+class IterableStreamDataset(IterableDataset):
+    """Streams docs, tokenizes, packs into (T+1,) rows via best-fit.
+    Each worker gets its own instance (via fork/spawn) and initializes its
+    own HF streams + rustbpe tokenizer + factual injector. The tokenizer
+    pickled blob is small (~1 MB) and thread-safe per tiktoken docs.
+    """
+    def __init__(
+        self,
+        split: str,
+        seq_len: int,
+        *,
+        base_seed: int = 0,
+        doc_buffer_size: int = 1000,
+        tokenizer_batch: int = 128,
+    ):
+        super().__init__()
+        assert split in ("train", "val"), split
+        self.split = split
+        self.seq_len = seq_len
+        self.row_capacity = seq_len + 1
+        self.base_seed = base_seed
+        self.doc_buffer_size = doc_buffer_size
+        self.tokenizer_batch = tokenizer_batch
+    def _pick_weights(self) -> dict[str, float]:
+        if self.split == "val":
+            if os.environ.get("HYDRA_USE_FULL_BLEND", "0") == "1":
+                return FULL_BLEND_WEIGHTS
+            return {"Nemotron-Pretraining-Multiple-Choice": 1.0}
+        if os.environ.get("HYDRA_USE_FULL_BLEND", "0") == "1":
+            return FULL_BLEND_WEIGHTS
+        phase = os.environ.get("HYDRA_NEMOTRON_PHASE", "phase1").strip().lower()
+        return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        info = get_worker_info()
+        worker_id = 0 if info is None else info.id
+        # Each worker builds its own tokenizer instance. tiktoken's Encoding
+        # object is pickleable and the underlying C++ BPE is thread-safe;
+        # per-worker instantiation avoids cross-process sharing headaches.
+        tokenizer = _prepare.Tokenizer.from_directory()
+        bos = tokenizer.get_bos_token_id()
+        # Each worker gets its own weighted HF stream. Seed offset ensures
+        # disjoint config-choice trajectories; HF's own shuffle buffer handles
+        # shard randomization.
+        val_seed = 12345  # deterministic val
+        seed = val_seed if self.split == "val" else self.base_seed
+        stream = _WorkerWeightedStream(
+            self._pick_weights(), base_seed=seed, worker_id=worker_id,
+        )
+        row_capacity = self.row_capacity
+        doc_buffer: list[list[int]] = []
+        doc_batch_size = self.tokenizer_batch
+        def refill_buffer() -> None:
+            # Collect doc_batch_size text strings, then batch-tokenize.
+            texts: list[str] = []
+            for _ in range(doc_batch_size):
+                text, _epoch = next(stream)
+                if text:
+                    texts.append(text)
+            if texts:
+                token_lists = tokenizer.encode(texts, prepend=bos)
+                doc_buffer.extend(token_lists)
+        while True:
+            pos = 0
+            row = torch.empty(row_capacity, dtype=torch.long)
+            while pos < row_capacity:
+                while len(doc_buffer) < self.doc_buffer_size:
+                    refill_buffer()
+                remaining = row_capacity - pos
+                # Best-fit packing: largest doc that fully fits.
+                best_idx = -1
+                best_len = 0
+                for i, doc in enumerate(doc_buffer):
+                    dlen = len(doc)
+                    if dlen <= remaining and dlen > best_len:
+                        best_idx = i
+                        best_len = dlen
+                if best_idx >= 0:
+                    doc = doc_buffer.pop(best_idx)
+                    row[pos : pos + len(doc)] = torch.tensor(doc, dtype=torch.long)
+                    pos += len(doc)
+                else:
+                    # No doc fits remaining space — crop shortest to fill.
+                    shortest_idx = min(
+                        range(len(doc_buffer)),
+                        key=lambda i: len(doc_buffer[i]),
+                    )
+                    doc = doc_buffer.pop(shortest_idx)
+                    row[pos : pos + remaining] = torch.tensor(
+                        doc[:remaining], dtype=torch.long,
+                    )
+                    pos += remaining
+            yield row
+# ---------------------------------------------------------------------------
+# LightningDataModule
+# ---------------------------------------------------------------------------
+class HydraDataModule(L.LightningDataModule):
+    def __init__(
+        self,
+        batch_size: int | None = None,
+        seq_len: int | None = None,
+        num_workers: int | None = None,
+        prefetch_factor: int | None = None,
+    ):
+        super().__init__()
+        self.batch_size = batch_size or int(os.environ.get("HYDRA_BATCH_SIZE", "1"))
+        self.seq_len = seq_len or int(os.environ.get("HYDRA_SEQ_LEN", "512"))
+        self.num_workers = (
+            num_workers
+            if num_workers is not None
+            else int(os.environ.get("HYDRA_DATA_NUM_WORKERS", "2"))
+        )
+        self.prefetch_factor = (
+            prefetch_factor
+            if prefetch_factor is not None
+            else int(os.environ.get("HYDRA_DATA_PREFETCH", "4"))
+        )
+        self.doc_buffer = int(os.environ.get("HYDRA_DATA_BUFFER", "1000"))
+    def _make_loader(self, split: str, seed: int) -> DataLoader:
+        dataset = IterableStreamDataset(
+            split=split,
+            seq_len=self.seq_len,
+            base_seed=seed,
+            doc_buffer_size=self.doc_buffer,
+        )
+        # num_workers=0 → main-process iteration (useful for debugging). With
+        # IterableDataset the DataLoader batches the rows into (B, T+1) via
+        # default torch.stack-collate.
+        kw: dict = dict(
+            dataset=dataset,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            pin_memory=True,
+            drop_last=True,
+        )
+        if self.num_workers > 0:
+            kw["prefetch_factor"] = self.prefetch_factor
+            kw["persistent_workers"] = True
+        return DataLoader(**kw)
+    def train_dataloader(self) -> DataLoader:
+        return self._make_loader("train", seed=0)
+    def val_dataloader(self) -> DataLoader:
+        return self._make_loader("val", seed=12345)

overlay/hydra/diffusion_loss.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""MDLM Rao-Blackwellized Masked Diffusion Loss.
+Implements the masked-diffusion ELBO from:
+    Sahoo et al., "Simple and Effective Masked Diffusion Language Models" (MDLM),
+    NeurIPS 2024, arXiv:2406.07524.
+Equations referenced:
+    - Forward process: eq. 2  (per-token Bernoulli masking at rate 1 - alpha_t)
+    - Log-linear schedule:    alpha_t = 1 - t,  t ~ Uniform(0, 1)
+    - RB-ELBO:     eq. 7-8   L_RB = E_t E_q [ (1/alpha_t) * CE(x_theta(x_t), x_0) ]
+                              where the expectation over masked positions.
+Key insight: the Rao-Blackwellized estimate replaces an average over all masks
+(exponential) by a closed-form weighted CE that applies weight 1/alpha_t only
+on the positions that were masked, and 0 on unmasked positions. This gives an
+unbiased estimator with lower variance than a naive Monte Carlo over mask
+patterns.
+Reference implementation cross-checked against:
+    https://github.com/kuleshov-group/mdlm  (diffusion.py::DiffusionModel._loss)
+"""
+from __future__ import annotations
+from typing import Literal
+import torch
+import torch.nn.functional as F
+# Clamping weight keeps gradients finite while still up-weighting high-noise
+# positions. Historical value 1/eps=1000 blew up HYDRA training on a 12h v2
+# launch (2026-04-22): loss 26 → 42 → NaN in 13 steps under Muon lr=7e-3
+# because per-token CE × 1000 saturated the 100-unit FAIL guard. The MDLM
+# paper reports stable training at Adam lr=1e-4; HYDRA uses Muon at 7e-3
+# (70× larger), so the weight clamp needs to compensate.
+#
+# Tunable via HYDRA_MDLM_MAX_WEIGHT (default 5.0). Set =1.0 to disable
+# weighting entirely (flat masked-LM CE, no RB reweighting — simpler and
+# more stable, sacrifices the theoretical ELBO property).
+import os as _os
+_MAX_WEIGHT: float = float(_os.environ.get("HYDRA_MDLM_MAX_WEIGHT", "5.0"))
+_MIN_ALPHA: float = 1.0 / _MAX_WEIGHT  # so clamp(alpha, min=_MIN_ALPHA) gives 1/alpha <= _MAX_WEIGHT
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def mdlm_masked_forward_process(
+    targets: torch.Tensor,
+    mask_token_id: int,
+    t: torch.Tensor | None = None,
+    alpha_schedule: Literal["linear", "loglinear"] = "loglinear",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """MDLM forward (noising) process: mask tokens and compute RB weights.
+    Args:
+        targets: (B, T) int64 token ids — the clean sequence x_0.
+        mask_token_id: The special token id used to represent a masked token.
+        t: (B,) float in (0, 1). If None, samples Uniform(0, 1) per batch
+            element. t=0 means fully clean; t=1 means fully masked.
+        alpha_schedule: Noise schedule.
+            "loglinear" (MDLM default): alpha_t = 1 - t
+            "linear": identical formula — both are provided for completeness
+            since the paper calls the 1-t schedule "log-linear" in the context
+            of the ELBO derivation.
+    Returns:
+        x_t           : (B, T) int64 — noised sequence; masked positions hold
+                        mask_token_id, unmasked positions equal targets.
+        mask_positions: (B, T) bool  — True where the token was masked.
+        loss_weights  : (B, T) float32 — RB weighting factor. On masked
+                        positions: 1/alpha_t (clamped to _MAX_WEIGHT). On
+                        unmasked positions: 0.0. Summing
+                        (CE * loss_weights * mask_positions).sum() / mask.sum()
+                        gives the per-sample RB-ELBO estimator.
+    """
+    B, T = targets.shape
+    device = targets.device
+    dtype = torch.float32
+    # --- sample or validate t ---
+    if t is None:
+        # Uniform(0, 1) per batch element; avoid exactly 0 and 1.
+        t = torch.rand(B, device=device, dtype=dtype)
+    else:
+        t = t.to(device=device, dtype=dtype)
+        if t.shape != (B,):
+            raise ValueError(f"t must be shape (B,)={(B,)}, got {t.shape}")
+        if (t < 0).any() or (t > 1).any():
+            raise ValueError("t must be in [0, 1]")
+    # --- noise schedule: alpha_t = probability that a token is NOT masked ---
+    # Both "linear" and "loglinear" in MDLM use alpha_t = 1 - t; the paper
+    # refers to "log-linear" because the schedule is linear in the *log* domain
+    # of the forward process probability. We expose both names for clarity.
+    if alpha_schedule in ("linear", "loglinear"):
+        alpha_t = 1.0 - t          # (B,) float, in [0, 1]
+    else:
+        raise ValueError(f"Unknown alpha_schedule: {alpha_schedule!r}. Use 'linear' or 'loglinear'.")
+    # --- per-token Bernoulli mask ---
+    # alpha_t[:, None] broadcasts to (B, T).
+    alpha_t_expanded = alpha_t[:, None]                # (B, 1)
+    # Bernoulli(1 - alpha_t) = 1 means "mask this token".
+    # We sample independently per token, per batch element.
+    rand = torch.rand(B, T, device=device, dtype=dtype)
+    mask_positions = rand > alpha_t_expanded           # (B, T) bool
+    # True  → masked position
+    # False → unmasked (kept as original)
+    # --- build x_t ---
+    x_t = targets.clone()
+    x_t = torch.where(mask_positions, torch.full_like(x_t, mask_token_id), x_t)
+    # --- RB loss weights: 1/alpha_t on masked positions, 0 elsewhere ---
+    # Clamp alpha_t so weights stay finite near t→1.
+    safe_alpha = alpha_t.clamp(min=_MIN_ALPHA)         # (B,)
+    weight_per_sample = 1.0 / safe_alpha               # (B,)
+    # Broadcast to (B, T) and zero out unmasked positions.
+    loss_weights = weight_per_sample[:, None].expand(B, T).to(dtype=dtype)  # (B, T)
+    loss_weights = loss_weights * mask_positions.float()
+    return x_t, mask_positions, loss_weights
+def mdlm_rb_loss(
+    logits: torch.Tensor,
+    targets: torch.Tensor,
+    mask_positions: torch.Tensor,
+    loss_weights: torch.Tensor,
+    ignore_index: int = -100,
+) -> torch.Tensor:
+    """Rao-Blackwellized negative ELBO.
+    Applies the MDLM loss: cross-entropy on masked positions only, weighted
+    per-token by loss_weights, averaged over the batch.
+    The formula (eq. 7-8 of arXiv:2406.07524):
+        L_RB = mean_B [ sum_T (weight_t * CE(logits_i, target_i) * mask_i)
+                        / max(sum_T(mask_i), 1) ]
+    Args:
+        logits        : (B, T, V) raw logits. May be bf16; internally cast to
+                        float32 for CE computation.
+        targets       : (B, T) int64 true token ids (x_0).
+        mask_positions: (B, T) bool — True = masked position.
+        loss_weights  : (B, T) float32 — 1/alpha_t on masked positions, 0 elsewhere.
+        ignore_index  : Passed to F.cross_entropy; positions with this label
+                        are excluded from the loss.
+    Returns:
+        Scalar float32 loss. Returns 0.0 tensor if no positions are masked.
+    """
+    B, T, V = logits.shape
+    # Ensure float32 for numerical stability; F.cross_entropy accepts fp16/bf16
+    # logits but accumulates in float internally anyway. Being explicit avoids
+    # silent precision surprises.
+    logits_f = logits.float()                          # (B, T, V)
+    # Build targets with ignore_index on UNmasked positions so CE only fires
+    # where mask_positions is True. We also honour any pre-existing -100 values
+    # (e.g. doc-separator masking upstream).
+    targets_masked = torch.where(
+        mask_positions & (targets != ignore_index),
+        targets,
+        torch.full_like(targets, ignore_index),
+    )
+    # Per-token CE; shape (B, T). Positions with ignore_index → 0 from CE.
+    per_tok_ce = F.cross_entropy(
+        logits_f.reshape(B * T, V),
+        targets_masked.reshape(B * T),
+        ignore_index=ignore_index,
+        reduction="none",
+    ).reshape(B, T)                                    # (B, T) float32
+    # Apply RB weight. loss_weights already has 0 on unmasked positions.
+    weighted = per_tok_ce * loss_weights               # (B, T)
+    # Per-sample mean over masked positions, then average over batch.
+    mask_f = mask_positions.float()                    # (B, T)
+    per_sample_mask_count = mask_f.sum(dim=1).clamp(min=1)   # (B,)
+    per_sample_loss = weighted.sum(dim=1) / per_sample_mask_count  # (B,)
+    return per_sample_loss.mean()                      # scalar float32
+def mdlm_loss(
+    logits: torch.Tensor,
+    targets: torch.Tensor,
+    mask_token_id: int,
+    t: torch.Tensor | None = None,
+    alpha_schedule: Literal["linear", "loglinear"] = "loglinear",
+    ignore_index: int = -100,
+) -> torch.Tensor:
+    """Convenience wrapper: forward process + RB-ELBO in one call.
+    Suitable for the common case where the caller has full-vocab logits and
+    wants a drop-in replacement for a standard masked-LM CE loss.
+    Args:
+        logits        : (B, T, V) raw logits.
+        targets       : (B, T) int64 clean token ids.
+        mask_token_id : The MASK token id used to corrupt the input.
+        t             : Optional (B,) timestep in (0, 1). Sampled if None.
+        alpha_schedule: "loglinear" (default) or "linear".
+        ignore_index  : Token id to ignore in the loss (e.g. padding).
+    Returns:
+        Scalar float32 MDLM RB-ELBO loss.
+    Note on sampled-softmax / partial logits:
+        If your model only computes logits for a subset of vocab positions
+        (e.g. HYDRA's sampled-softmax head), call mdlm_masked_forward_process
+        and mdlm_rb_loss separately. mdlm_rb_loss expects full-vocab logits.
+    """
+    x_t, mask_positions, loss_weights = mdlm_masked_forward_process(
+        targets=targets,
+        mask_token_id=mask_token_id,
+        t=t,
+        alpha_schedule=alpha_schedule,
+    )
+    # x_t is produced for the model's input (not used by this convenience
+    # wrapper since logits are already provided by the caller). In a real
+    # training loop the caller feeds x_t into the model to get logits, THEN
+    # calls this function. See the orchestrator wiring note in training.py.
+    return mdlm_rb_loss(
+        logits=logits,
+        targets=targets,
+        mask_positions=mask_positions,
+        loss_weights=loss_weights,
+        ignore_index=ignore_index,
+    )

overlay/hydra/engram.py CHANGED Viewed

@@ -1,19 +1,48 @@
-"""GPU Engram — conditional memory with Hebbian writes.
-Extracted verbatim from train.py (W1 modularization). Semantics unchanged.
-Note on grad_accum>=2 autograd safety (previously suspected bug):
-- `self.memory` is the nn.Parameter keys table.
-- Forward reads `self.memory[indices]` (gradient-bearing lookup).
-- Hebbian write `self.memory.data.index_add_(...)` mutates storage via .data
-  WITHOUT bumping the autograd version counter. This means PyTorch will NOT
-  raise "modified in-place" on subsequent backward passes for the previously-
-  saved `retrieved` tensor. The mutation does give slightly stale gradients
-  for backward1 after forward1's write (by design — Hebbian is a one-shot EMA
-  write, not a gradient signal), but it does NOT break autograd.
-- Live test on RTX 3060 at batch=8, total=32768 (grad_accum=2) runs cleanly
-  for 69 steps. The bug reported in the mandate was already closed by the
-  F7 revert (persistent stacked_params_buf removal in MuonAdamW).
 """
 from __future__ import annotations
@@ -21,23 +50,71 @@ from __future__ import annotations
 import torch
 import torch.nn as nn
 class GPUEngram(nn.Module):
-    """GPU-native Engram with Hebbian writes. No Rust."""
-    def __init__(self, d_model: int, n_columns: int = 1024, max_ngram: int = 3) -> None:
         super().__init__()
         self.n_columns = n_columns
         self.max_ngram = max_ngram
         self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
         self.gate = nn.Linear(d_model, 1, bias=True)
         nn.init.constant_(self.gate.bias, 0.0)  # START OPEN
         self.primes = [2654435761, 2246822519, 3266489917]
         self.hebbian_lr = 0.01
     def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
-        # Fast n-gram hash: XOR of shifted token IDs with primes.
-        # Unrolled for max_ngram=3 (no Python loop).
         B, T = token_ids.shape
         h = token_ids * self.primes[0]
         if T > 1:
@@ -50,18 +127,43 @@ class GPUEngram(nn.Module):
             h = h ^ (shifted2 * self.primes[2])
         return h % self.n_columns
     def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
-        indices = self._hash(token_ids)  # (B, T)
-        # Gradient-bearing memory lookup: backprop flows through to self.memory
-        # so the keys learn via autograd alongside the Hebbian EMA writes below.
-        retrieved = self.memory[indices]  # (B, T, d_model)
-        alpha = torch.sigmoid(self.gate(x))
-        # Vectorized Hebbian write via index_add_ (no expand_as alloc)
-        if self.training:
             with torch.no_grad():
-                flat_idx = indices.reshape(-1)  # (B*T,)
                 flat_x = x.detach().reshape(-1, x.shape[-1])  # (B*T, d_model)
                 mem_dtype = self.memory.data.dtype
                 updates = (
@@ -70,6 +172,6 @@ class GPUEngram(nn.Module):
                 ).to(mem_dtype)
                 self.memory.data.index_add_(0, flat_idx, updates)
-        # hit_rate = soft gate average — keep as tensor, defer .item() to caller
         hit_rate = (alpha.detach() > 0.1).float().mean()
         return x + alpha * retrieved, hit_rate

+"""GPU Engram — Sparse Modern Hopfield retrieval path.
+## What changed (scatter-gather → Hopfield matmul)
+The original forward used `self.memory[indices]` (scatter-gather), which misses
+L2 cache at n_columns > 4096 and creates a hard tps ceiling.
+The replacement uses:
+    scores   = x @ self.memory.T          # (B, T, n_columns) — coalesced matmul
+    weights  = entmax15(scores, dim=-1)   # sparse attention; 95%+ exact zeros
+    retrieved = weights @ self.memory     # (B, T, d_model)   — coalesced matmul
+Both matmuls are tile-friendly (cuBLAS GEMM), so L2 reuse is high regardless of
+n_columns. Gradient flows through both matmuls so `self.memory` learns via
+autograd in addition to (or instead of) the Hebbian EMA writes.
+## Sparsity mechanism
+alpha-entmax with alpha=1.5 (entmax15) is a sparse attention operator that maps
+logit vectors to distributions where many entries are *exactly* zero (not merely
+small). It generalises softmax (alpha=1) and argmax (alpha→∞). At n_columns=1024
+with d_model=64 a random batch typically hits ≥95% zero entries — the key
+property that keeps bandwidth proportional to *attended* columns, not all columns.
+Fallback: if `entmax` is not pip-installed, top-k softmax (k=32) is used instead.
+This is chosen at module-import time — NO runtime branching per forward call.
+## token_ids argument
+token_ids is accepted for API compatibility with the rest of the hydra stack
+(train.py, lightning_module.py call `engram(x, token_ids)`). It is NOT used in
+the retrieval path — the Hopfield path computes dense similarity over the whole
+memory bank, which subsumes any hash-based column selection. Documented here to
+prevent confusion.
+## Hebbian writes (hebbian_boost=False by default)
+With Hopfield retrieval, gradient signals reach self.memory through autograd, so
+Hebbian EMA writes are no longer critical. They are preserved as an *optional*
+boost (hebbian_boost=True) for experiments that want both signals. Default is off.
+## Checkpoint compatibility
+`self.memory` shape (n_columns, d_model) is unchanged, so existing .pt / .ckpt
+files load without modification.
 """
 from __future__ import annotations
 import torch
 import torch.nn as nn
+# ---------------------------------------------------------------------------
+# Sparse-attention backend — chosen ONCE at import time, no runtime branching.
+# ---------------------------------------------------------------------------
+try:
+    from entmax import entmax15 as _entmax15  # type: ignore[import]
+    def _sparse_attention(scores: torch.Tensor) -> torch.Tensor:
+        """alpha-entmax (alpha=1.5): truly sparse distribution over last dim."""
+        return _entmax15(scores, dim=-1)
+    _BACKEND = "entmax15"
+except ImportError:  # pragma: no cover — entmax always installed in CI
+    _K = 32  # top-k for fallback
+    def _sparse_attention(scores: torch.Tensor) -> torch.Tensor:  # type: ignore[misc]
+        """Top-k softmax fallback: zero outside the k highest-scoring columns."""
+        topk_vals, topk_idx = scores.topk(_K, dim=-1)
+        topk_w = torch.softmax(topk_vals, dim=-1)
+        weights = torch.zeros_like(scores)
+        weights.scatter_(-1, topk_idx, topk_w)
+        return weights
+    _BACKEND = "topk32"
 class GPUEngram(nn.Module):
+    """GPU Engram: Sparse Modern Hopfield retrieval.
+    Args:
+        d_model:       Model dimension — must match the surrounding transformer.
+        n_columns:     Number of memory columns (key-value pairs). Safe at 32 768
+                       with the matmul path; the old scatter-gather had an L2
+                       cliff above ~4 096.
+        max_ngram:     Retained for API compatibility; unused in retrieval path.
+        hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
+                       during training (old behaviour, now optional). Default False.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_columns: int = 1024,
+        max_ngram: int = 3,
+        hebbian_boost: bool = False,
+    ) -> None:
         super().__init__()
         self.n_columns = n_columns
         self.max_ngram = max_ngram
+        self.hebbian_boost = hebbian_boost
+        # Shape unchanged from original — existing checkpoints load cleanly.
         self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
         self.gate = nn.Linear(d_model, 1, bias=True)
         nn.init.constant_(self.gate.bias, 0.0)  # START OPEN
+        # Retained for any external code that reads these attrs.
         self.primes = [2654435761, 2246822519, 3266489917]
         self.hebbian_lr = 0.01
+    # ------------------------------------------------------------------
+    # _hash: retained for API/checkpoint compat; unused in forward below.
+    # ------------------------------------------------------------------
     def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
+        """N-gram hash → column index (kept for backward-compat; not used in retrieval)."""
         B, T = token_ids.shape
         h = token_ids * self.primes[0]
         if T > 1:
             h = h ^ (shifted2 * self.primes[2])
         return h % self.n_columns
+    # ------------------------------------------------------------------
+    # forward
+    # ------------------------------------------------------------------
     def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
+        """Hopfield retrieve + soft gate + residual.
+        Args:
+            x:         (B, T, d_model) — input activations.
+            token_ids: (B, T) — token indices. Accepted for API compatibility;
+                       NOT used in the retrieval path (see module docstring).
+        Returns:
+            (x + alpha * retrieved, hit_rate)
+            - x + alpha * retrieved: (B, T, d_model)
+            - hit_rate: scalar tensor — fraction of gate values > 0.1
+        """
+        # ---- 1. Similarity scores (coalesced GEMM) ----------------------
+        # scores[b, t, c] = dot(x[b,t], memory[c])
+        scores = x @ self.memory.T  # (B, T, n_columns)
+        # ---- 2. Sparse attention weights --------------------------------
+        # _sparse_attention is fixed at import time (entmax15 or top-k).
+        weights = _sparse_attention(scores)  # (B, T, n_columns), many exact zeros
+        # ---- 3. Retrieved vector (coalesced GEMM) -----------------------
+        retrieved = weights @ self.memory  # (B, T, d_model)
+        # ---- 4. Soft gate (unchanged) -----------------------------------
+        alpha = torch.sigmoid(self.gate(x))  # (B, T, 1)
+        # ---- 5. Optional Hebbian EMA write ------------------------------
+        if self.training and self.hebbian_boost:
             with torch.no_grad():
+                # Reuse the hash-based indices for the write target (sparse update).
+                indices = self._hash(token_ids)
+                flat_idx = indices.reshape(-1)           # (B*T,)
                 flat_x = x.detach().reshape(-1, x.shape[-1])  # (B*T, d_model)
                 mem_dtype = self.memory.data.dtype
                 updates = (
                 ).to(mem_dtype)
                 self.memory.data.index_add_(0, flat_idx, updates)
+        # ---- 6. Residual + hit_rate -------------------------------------
         hit_rate = (alpha.detach() > 0.1).float().mean()
         return x + alpha * retrieved, hit_rate

overlay/hydra/eval.py CHANGED Viewed

@@ -8,14 +8,12 @@ Perf optimizations (eval_perf_fix):
 - Batched factual probes: single padded forward instead of N sequential
 """
-from __future__ import annotations
-import math
-import os
-import re as _re
-from typing import NotRequired, TypedDict
-import torch
 from hydra.config import FACTUAL_SAMPLES, FACTUAL_BATCH, FACTUAL_GEN_TOKENS
@@ -38,241 +36,13 @@ FACTUAL_EVAL = [
     ("Two plus two equals", ["4", "four"]),
 ]
-_FACTUAL_PROBES = [
     "The capital of France is",
     "Water boils at",
     "The largest planet in our solar system is",
     "The speed of light is approximately",
     "Shakespeare wrote",
-]
-class _InstructionCase(TypedDict):
-    prompt: str
-    kind: str
-    contains: NotRequired[list[str]]
-_INSTRUCTION_FOLLOWING_PROMPTS: list[_InstructionCase] = [
-    {"prompt": "Answer with exactly one word: the sky on a clear day is", "kind": "one_word", "contains": ["blue"]},
-    {"prompt": "Respond with YES or NO only: Is fire cold?", "kind": "yes_no", "contains": ["yes", "no"]},
-    {"prompt": "Continue the sequence: 2, 4, 6, 8,", "kind": "contains", "contains": ["10"]},
-    {"prompt": "Write exactly three comma-separated fruits:", "kind": "comma_three"},
-]
-def _word_tokens(text: str) -> list[str]:
-    return [w.lower() for w in _re.findall(r"\b[\w'-]+\b", text)]
-def compute_diversity_metrics(samples: list[str]) -> dict[str, float]:
-    """Compute lightweight lexical diversity/repetition metrics.
-    Metrics are intentionally simple and cheap so they can run in every job:
-    - distinct_1: unique unigrams / total unigrams
-    - distinct_2: unique bigrams / total bigrams
-    - repetition_rate: 1 - distinct_1
-    - repetition_bigram_rate: repeated bigrams / total bigrams
-    """
-    tokens: list[str] = []
-    for sample in samples:
-        tokens.extend(_word_tokens(sample))
-    if not tokens:
-        return {
-            "distinct_1": 0.0,
-            "distinct_2": 0.0,
-            "repetition_rate": 0.0,
-            "repetition_bigram_rate": 0.0,
-        }
-    unigrams = set(tokens)
-    distinct_1 = len(unigrams) / len(tokens)
-    bigrams = list(zip(tokens, tokens[1:]))
-    if not bigrams:
-        return {
-            "distinct_1": float(distinct_1),
-            "distinct_2": 0.0,
-            "repetition_rate": float(1.0 - distinct_1),
-            "repetition_bigram_rate": 0.0,
-        }
-    counts: dict[tuple[str, str], int] = {}
-    for bg in bigrams:
-        counts[bg] = counts.get(bg, 0) + 1
-    repeated = sum(1 for _, count in counts.items() if count > 1)
-    distinct_2 = len(counts) / len(bigrams)
-    return {
-        "distinct_1": float(distinct_1),
-        "distinct_2": float(distinct_2),
-        "repetition_rate": float(1.0 - distinct_1),
-        "repetition_bigram_rate": float(repeated / len(bigrams)),
-    }
-def _generate_continuation(
-    model,
-    tokenizer,
-    prompt: str,
-    *,
-    max_seq_len: int,
-    gen_tokens: int = 16,
-    temperature: float = 0.9,
-) -> str:
-    ids = tokenizer.encode(prompt)
-    ctx = torch.tensor([ids], device="cuda", dtype=torch.long)
-    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        for _ in range(gen_tokens):
-            logits = model(ctx, targets=None)
-            next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
-            if temperature <= 0:
-                next_id = torch.argmax(next_logits, dim=-1, keepdim=True)
-            else:
-                probs = torch.softmax(next_logits.float() / temperature, dim=-1)
-                next_id = torch.multinomial(probs, num_samples=1)
-            ctx = torch.cat([ctx, next_id], dim=1)
-            if ctx.size(1) >= max_seq_len:
-                break
-    generated = tokenizer.decode(ctx[0].tolist())
-    return generated[len(prompt):].strip()
-def _score_instruction_completion(kind: str, completion: str, contains: list[str] | None = None) -> bool:
-    text = completion.strip().lower()
-    words = _word_tokens(text)
-    contains = contains or []
-    if kind == "one_word":
-        return len(words) == 1 and any(c in text for c in contains)
-    if kind == "yes_no":
-        return len(words) >= 1 and words[0] in {"yes", "no"}
-    if kind == "contains":
-        return any(c in text for c in contains)
-    if kind == "comma_three":
-        parts = [p.strip() for p in completion.split(",") if p.strip()]
-        return len(parts) == 3
-    return False
-def run_instruction_following_proxy(model, tokenizer, max_seq_len: int):
-    """Run a small proxy suite for instruction-following behavior."""
-    print("---")
-    print("instruction_following_samples:")
-    model.eval()
-    hits = 0
-    outputs: list[str] = []
-    for case in _INSTRUCTION_FOLLOWING_PROMPTS:
-        prompt = case["prompt"]
-        kind = case["kind"]
-        contains = case.get("contains")
-        completion = _generate_continuation(
-            model,
-            tokenizer,
-            prompt,
-            max_seq_len=max_seq_len,
-            gen_tokens=16,
-            temperature=0.8,
-        )
-        ok = _score_instruction_completion(
-            kind,
-            completion,
-            contains,
-        )
-        outputs.append(completion)
-        if ok:
-            hits += 1
-        print(f"  prompt: {prompt!r}")
-        print(f"  output: {completion.replace(chr(10), ' ')!r}")
-        print(f"  hit:    {ok}")
-    score = hits / len(_INSTRUCTION_FOLLOWING_PROMPTS)
-    print("---")
-    print(f"instruction_following_score: {score:.4f}")
-    print(f"instruction_following_hits:  {hits}/{len(_INSTRUCTION_FOLLOWING_PROMPTS)}")
-    return score, hits, len(_INSTRUCTION_FOLLOWING_PROMPTS), outputs
-def compute_token_calibration(
-    model,
-    tokenizer,
-    max_seq_len: int,
-    batch_size: int,
-    *,
-    num_batches: int = 2,
-    n_bins: int = 10,
-) -> dict[str, float]:
-    """Estimate token-level calibration metrics (ECE and Brier score)."""
-    if num_batches <= 0:
-        return {
-            "calibration_ece": 0.0,
-            "calibration_brier": 0.0,
-            "calibration_accuracy": 0.0,
-            "calibration_tokens": 0.0,
-        }
-    import prepare as _prepare_mod
-    from prepare import make_dataloader as _make_dataloader
-    val_loader = _make_dataloader(tokenizer, batch_size, max_seq_len, "val")
-    bin_count = [0 for _ in range(n_bins)]
-    bin_correct = [0 for _ in range(n_bins)]
-    bin_conf_sum = [0.0 for _ in range(n_bins)]
-    total_tokens = 0
-    total_correct = 0
-    brier_sum = 0.0
-    model.eval()
-    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        for _ in range(num_batches):
-            x, y, _ = next(val_loader)
-            logits = model(x, targets=None)
-            if logits.dim() == 2:
-                logits = logits.unsqueeze(1)
-            probs = torch.softmax(logits.float(), dim=-1)
-            conf, pred = torch.max(probs, dim=-1)
-            correct = pred.eq(y)
-            conf_flat = conf.reshape(-1)
-            correct_flat = correct.reshape(-1)
-            total_tokens += int(conf_flat.numel())
-            total_correct += int(correct_flat.sum().item())
-            for c, ok in zip(conf_flat.tolist(), correct_flat.tolist()):
-                bidx = min(int(math.floor(c * n_bins)), n_bins - 1)
-                bin_count[bidx] += 1
-                bin_conf_sum[bidx] += c
-                if ok:
-                    bin_correct[bidx] += 1
-                brier_sum += (1.0 - c) ** 2 if ok else c ** 2
-    if total_tokens == 0:
-        return {
-            "calibration_ece": 0.0,
-            "calibration_brier": 0.0,
-            "calibration_accuracy": 0.0,
-            "calibration_tokens": 0.0,
-        }
-    ece = 0.0
-    for idx in range(n_bins):
-        if bin_count[idx] == 0:
-            continue
-        acc = bin_correct[idx] / bin_count[idx]
-        avg_conf = bin_conf_sum[idx] / bin_count[idx]
-        ece += abs(acc - avg_conf) * (bin_count[idx] / total_tokens)
-    return {
-        "calibration_ece": float(ece),
-        "calibration_brier": float(brier_sum / total_tokens),
-        "calibration_accuracy": float(total_correct / total_tokens),
-        "calibration_tokens": float(total_tokens),
-    }
 def run_factual_probes(model, tokenizer, device, autocast_ctx) -> None:

 - Batched factual probes: single padded forward instead of N sequential
 """
+from __future__ import annotations
+import os
+import re as _re
+import torch
 from hydra.config import FACTUAL_SAMPLES, FACTUAL_BATCH, FACTUAL_GEN_TOKENS
     ("Two plus two equals", ["4", "four"]),
 ]
+_FACTUAL_PROBES = [
     "The capital of France is",
     "Water boils at",
     "The largest planet in our solar system is",
     "The speed of light is approximately",
     "Shakespeare wrote",
+]
 def run_factual_probes(model, tokenizer, device, autocast_ctx) -> None:

overlay/hydra/gdn_block.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""GDNBlock — Gated Delta Net block, drop-in shape-compatible with Mamba3Block and HyenaBlock.
+GatedDeltaNet (GDN) reference: arXiv:2412.06464 (ICLR 2025, NVLabs).
+Implementation: flash-linear-attention (fla) library, Triton kernels, sm86-compatible.
+Interface contract (MUST match how Mamba3/Hyena are called in hydra/model.py):
+    block = GDNBlock(d_model, ...)
+    y = block(x)    # x: [B, T, d_model]  ->  y: [B, T, d_model]
+The surrounding mHC layer does NOT pre-norm before calling this block (the
+raw hidden state is passed in); the block itself applies no input normalization,
+same as HyenaBlock.  We return the raw operator output; the mHC layer adds it
+as a residual stream contribution.
+NO attention, NO softmax-over-sequence-dim.  All state is stateless between
+.forward() calls by default (use_cache=False, past_key_values=None).
+"""
+from __future__ import annotations
+try:
+    from fla.layers.gated_deltanet import GatedDeltaNet as _GatedDeltaNet
+except ImportError as _fla_err:
+    raise ImportError(
+        "flash-linear-attention (fla) is required for GDNBlock but could not be imported. "
+        "Install it with:\n"
+        "    pip install flash-linear-attention\n"
+        "or from source:\n"
+        "    pip install git+https://github.com/fla-org/flash-linear-attention.git\n"
+        f"Original error: {_fla_err}"
+    ) from _fla_err
+import torch
+import torch.nn as nn
+class GDNBlock(nn.Module):
+    """Gated Delta Net block, drop-in shape-compatible with HYDRA's Mamba3Block and HyenaBlock.
+    Wraps `fla.layers.GatedDeltaNet` with the same external API that
+    `hydra.hyena_block.HyenaBlock` exposes:
+        forward(x: Tensor[B, T, d_model]) -> Tensor[B, T, d_model]
+    Internal GatedDeltaNet.forward returns a 3-tuple
+    (hidden_states, attn_weights, past_key_values); we extract [0] and
+    return only the hidden states, keeping the residual stream unchanged.
+    GDN outperforms Mamba-2 on in-context retrieval benchmarks (MQAR, etc.)
+    at equal or faster compute, making it a targeted fix for HYDRA's factual
+    plateau.
+    Parameter counts are deliberately kept within 2x of a Mamba3 block at the
+    same d_model/n_heads to be drop-in affordable.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int = 6,
+        mode: str = "chunk",       # 'chunk' for training, 'fused_recurrent' for inference
+        expand_v: float = 2.0,     # value-projection expansion; controls KV memory
+        use_short_conv: bool = True,
+        conv_size: int = 4,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.mode = mode
+        # head_dim must divide d_model.  GDN uses separate q/k head_dim from v;
+        # we set head_dim for q/k such that n_heads * head_dim == d_model.
+        if d_model % n_heads != 0:
+            raise ValueError(
+                f"d_model={d_model} must be divisible by n_heads={n_heads} "
+                "so that head_dim = d_model // n_heads is an integer."
+            )
+        head_dim = d_model // n_heads
+        self.gdn = _GatedDeltaNet(
+            hidden_size=d_model,
+            expand_v=expand_v,
+            head_dim=head_dim,
+            num_heads=n_heads,
+            mode=mode,
+            use_gate=True,          # gating is the key architectural feature of GDN
+            use_short_conv=use_short_conv,
+            conv_size=conv_size,
+            layer_idx=None,         # no KV-cache layer indexing; we manage state ourselves
+        )
+    # ------------------------------------------------------------------
+    # Forward
+    # ------------------------------------------------------------------
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """x: [B, T, d_model]  ->  y: [B, T, d_model].
+        Passes through GatedDeltaNet with use_cache=False so no recurrent
+        state leaks between independent forward() calls (important for
+        gradient-accumulation loops and eval).
+        """
+        # GatedDeltaNet.forward signature:
+        #   (hidden_states, attention_mask=None, past_key_values=None,
+        #    use_cache=False, output_attentions=False)
+        # Returns: tuple(hidden_states, attn_weights|None, past_kv|None)
+        out, _, _ = self.gdn(
+            hidden_states=x,
+            attention_mask=None,
+            past_key_values=None,
+            use_cache=False,
+            output_attentions=False,
+        )
+        return out
+    # ------------------------------------------------------------------
+    # API parity with HyenaBlock and Mamba3Block
+    # ------------------------------------------------------------------
+    def invalidate_caches(self) -> None:
+        """No-op — GDNBlock holds no persistent filter cache.
+        Provided for API parity with HyenaBlock, which invalidates its
+        Hyena filter cache here.  Calling this is always safe.
+        """
+        pass

overlay/hydra/hyena_block.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""HyenaBlock — drop-in block for HYDRA, supplement to Mamba3.
+Wraps `subsystems.hyena_pure.HyenaOperator` with a pre-norm + residual scheme
+consistent with how the mHC stack wraps Mamba3 in `hydra/model.py`.
+Interface contract (MUST match how Mamba3 is called in model.py):
+    block = HyenaBlock(d_model, seq_len)
+    y = block(x)   # x: [B, T, d_model]  ->  y: [B, T, d_model]
+The surrounding mHC layer does the pre-norm (`norm(h)`) BEFORE calling the
+block, so the block itself should NOT re-normalize at input — same as Mamba3
+in the current model. We return the raw operator output; the mHC layer then
+adds it as a residual stream contribution.
+NO attention, NO softmax-over-sequence-dim, NO KV-cache. All forbidden
+imports enumerated in tests/test_hyena.py (test #7) are absent.
+"""
+from __future__ import annotations
+import os
+import torch
+import torch.nn as nn
+from subsystems.hyena_pure import HyenaOperator
+class HyenaBlock(nn.Module):
+    """Single Hyena block, shape-compatible with Mamba3 in HYDRA."""
+    def __init__(
+        self,
+        d_model: int,
+        seq_len: int,
+        order: int | None = None,
+        filter_order: int | None = None,
+        dropout: float = 0.0,
+        filter_dropout: float = 0.0,
+        short_filter_order: int = 3,
+        activation: str = "id",
+    ):
+        super().__init__()
+        # Env overrides (documented in hydra/config.py).
+        if order is None:
+            order = int(os.environ.get("HYDRA_HYENA_ORDER", "2"))
+        if filter_order is None:
+            filter_order = int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64"))
+        self.d_model = d_model
+        self.seq_len = seq_len
+        self.order = order
+        self.filter_order = filter_order
+        self.operator = HyenaOperator(
+            d_model=d_model,
+            l_max=seq_len,
+            order=order,
+            filter_order=filter_order,
+            dropout=dropout,
+            filter_dropout=filter_dropout,
+            short_filter_order=short_filter_order,
+            activation=activation,
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """x: [B, T, d_model]  ->  y: [B, T, d_model]."""
+        return self.operator(x)

overlay/hydra/lightning_module.py ADDED Viewed

	@@ -0,0 +1,326 @@

+"""LightningModule wrapping PostSemClawModel.
+Thin adapter. The model and the MuonAdamW optimizer are unchanged. This
+module implements:
+  • configure_optimizers — returns the existing MuonAdamW (subclass of
+    torch.optim.Optimizer) built by model.setup_optimizer. Lightning accepts
+    this directly.
+  • training_step — splits (B, T+1) batches into (x, y), forwards through
+    the model, logs loss / bpb / tps / mfu / vram. Preserves the
+    sampled-softmax path inside PostSemClawModel (no changes there).
+  • optimizer_step — before each step we update LR + muon momentum + WD
+    using the same time-progress schedule as hydra/training.py
+    (get_lr_multiplier / get_muon_momentum / get_weight_decay). Lightning
+    handles grad accumulation via Trainer(accumulate_grad_batches=N).
+The SDR SOM update and Hestia QAT snap are called at the same cadence as
+the legacy loop, but inline on the main thread (Lightning provides its own
+callbacks for async work if we need to extract them later — keeping it
+simple for now).
+Env vars respected:
+  HYDRA_TIME_BUDGET          — wall-clock budget (s) used for LR schedule
+                                and as Trainer max_time
+  HYDRA_HESTIA_INTERVAL      — steps between Hestia snaps (default 100)
+  HYDRA_BATCH_SIZE           — device batch size (for throughput calc)
+  HYDRA_SEQ_LEN              — sequence length (for throughput calc)
+"""
+from __future__ import annotations
+import math
+import os
+import time
+import torch
+import lightning as L
+from hydra.config import (
+    ADAM_BETAS,
+    EMBEDDING_LR,
+    FINAL_LR_FRAC,
+    GPU_BF16_PEAK_FLOPS,
+    MATRIX_LR,
+    SCALAR_LR,
+    UNEMBEDDING_LR,
+    WARMUP_RATIO,
+    WEIGHT_DECAY,
+    PostSemClawConfig,
+)
+from hydra.model import PostSemClawModel
+# ---------------------------------------------------------------------------
+# LR / momentum / wd schedules — verbatim copy of hydra/training.py so the
+# curves match exactly. Kept here to avoid import cycles.
+# ---------------------------------------------------------------------------
+def _lr_multiplier(progress: float) -> float:
+    if progress < WARMUP_RATIO:
+        return progress / WARMUP_RATIO if WARMUP_RATIO > 0 else 1.0
+    decay_progress = (progress - WARMUP_RATIO) / max(1.0 - WARMUP_RATIO, 1e-9)
+    return FINAL_LR_FRAC + 0.5 * (1.0 - FINAL_LR_FRAC) * (
+        1 + math.cos(math.pi * decay_progress)
+    )
+def _muon_momentum(step: int) -> float:
+    frac = min(step / 300.0, 1.0)
+    return (1 - frac) * 0.85 + frac * 0.95
+def _weight_decay(progress: float) -> float:
+    return WEIGHT_DECAY * (1 - progress)
+# ---------------------------------------------------------------------------
+class HydraLightningModule(L.LightningModule):
+    """Lightning wrapper. Public attrs: self.model, self.config."""
+    def __init__(self, config: PostSemClawConfig):
+        super().__init__()
+        self.config = config
+        self.model = PostSemClawModel(config)
+        # Model weights init must be deferred to the correct device; done by
+        # caller after construction (to match the meta-device + to_empty()
+        # pattern used in the legacy loop).
+        # Time-based progress tracks the legacy loop's semantics: LR cosine
+        # is driven by wall-clock, not step count. We capture training start
+        # in on_train_start and TIME_BUDGET from env.
+        self.time_budget = float(
+            int(os.environ.get("HYDRA_TIME_BUDGET", "300"))
+        )
+        self._train_start_time: float | None = None
+        self._total_training_time = 0.0
+        self._last_step_end: float | None = None
+        self._hestia_interval = int(os.environ.get("HYDRA_HESTIA_INTERVAL", "100"))
+        self._flops_per_token = 0
+        self._tokens_per_step = 0
+        # Smoothed loss for the header-line log (matches legacy format).
+        self._ema_beta = 0.9
+        self._smooth_loss = 0.0
+        self._bpt_ema = 0.0
+        self._token_bytes: torch.Tensor | None = None
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+    def on_train_start(self) -> None:
+        self._train_start_time = time.time()
+        self._last_step_end = self._train_start_time
+        self._flops_per_token = self.model.estimate_flops()
+        # Tokens processed per optimizer step (pre-accum).
+        B = int(os.environ.get("HYDRA_BATCH_SIZE", "1"))
+        T = int(os.environ.get("HYDRA_SEQ_LEN", "512"))
+        self._tokens_per_step = B * T
+        # Build/cache token_bytes LUT (for bits-per-byte live metric).
+        import prepare as _p
+        self._token_bytes = _p.get_token_bytes(device=self.device)
+    def configure_optimizers(self):
+        optimizer = self.model.setup_optimizer(
+            unembedding_lr=UNEMBEDDING_LR,
+            embedding_lr=EMBEDDING_LR,
+            scalar_lr=SCALAR_LR,
+            adam_betas=ADAM_BETAS,
+            matrix_lr=MATRIX_LR,
+            weight_decay=WEIGHT_DECAY,
+        )
+        return optimizer
+    # ------------------------------------------------------------------
+    # Training step. Lightning auto-handles: autocast (via precision flag
+    # on Trainer), backward, grad-accum, zero_grad. We only:
+    #   - split batch into (x, y)
+    #   - forward through model (autocast is established by Trainer)
+    #   - return loss (grads flow from return)
+    # ------------------------------------------------------------------
+    def training_step(self, batch: torch.Tensor, batch_idx: int):
+        # DataLoader produces (B, T+1) rows; split into input/target.
+        # Lightning's default collate already moved batch to self.device via
+        # the accelerator callback when pin_memory=True and device != cpu.
+        if batch.dim() != 2:
+            raise RuntimeError(f"Expected (B, T+1) batch, got shape {tuple(batch.shape)}")
+        x = batch[:, :-1].contiguous()
+        y = batch[:, 1:].contiguous()
+        loss = self.model(x, y)
+        # Lightning applies the grad-accum divisor automatically; we just
+        # return the raw loss. loss.detach() is stored for logging.
+        self._log_step(loss.detach(), y)
+        return loss
+    # ------------------------------------------------------------------
+    # Optimizer step hook: update LR / momentum / WD using time-progress.
+    # Runs once per optimizer step (after all accum micro-batches).
+    # ------------------------------------------------------------------
+    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
+        # Update schedules from wall-clock progress.
+        now = time.time()
+        if self._train_start_time is None:
+            self._train_start_time = now
+            self._last_step_end = now
+        progress = min(self._total_training_time / max(self.time_budget, 1.0), 1.0)
+        step = self.global_step
+        lrm = _lr_multiplier(progress)
+        mom = _muon_momentum(step)
+        wd = _weight_decay(progress)
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * lrm
+            if group.get("kind") == "muon":
+                group["momentum"] = mom
+                group["weight_decay"] = wd
+        # Grad clip (matches legacy loop). Lightning provides this via
+        # Trainer(gradient_clip_val=1.0) but we want the exact call-site.
+        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
+        # Hyena train-cache: we must flush accumulated micro-batch grads BACK
+        # into the filter MLP params AFTER the accum-backward closure has run
+        # but BEFORE the optimizer actually consumes the grads. Lightning
+        # composes these so the closure runs inside optimizer.step(). We wrap
+        # the closure to insert our flush at the exact right moment.
+        #
+        # Ordering within the wrapped closure:
+        #   1. optimizer_closure() — runs all micro-batch forwards + backwards.
+        #      Each Hyena micro-batch backward accumulates into _k_leaf.grad.
+        #   2. flush_hyena_pending_grads() — one-shot
+        #      torch.autograd.backward(_k_graph, _k_leaf.grad) per HyenaFilter.
+        #      Now filter MLP / pos_emb / bias params have their correct grads.
+        #
+        # No-op when HYDRA_HYENA_TRAIN_CACHE=0 or no Hyena blocks exist.
+        _has_flush = hasattr(self.model, "flush_hyena_pending_grads")
+        if _has_flush:
+            _orig_closure = optimizer_closure
+            def _wrapped_closure():
+                result = _orig_closure()
+                self.model.flush_hyena_pending_grads()
+                return result
+            effective_closure = _wrapped_closure
+        else:
+            effective_closure = optimizer_closure
+        # Run the step (this is what Lightning would have done for us).
+        optimizer.step(closure=effective_closure)
+        self.model.zero_grad(set_to_none=True)
+        # Hyena filter-rfft cache invalidation. No-op if:
+        #   (a) no Hyena layers are in the model, or
+        #   (b) HYDRA_HYENA_FILTER_CACHE=0 and HYDRA_HYENA_TRAIN_CACHE=0
+        #       (the operators never populated either cache)
+        # In either case this is a handful of Python attribute resets.
+        if hasattr(self.model, "invalidate_hyena_caches"):
+            self.model.invalidate_hyena_caches()
+        # Hestia QAT snap every N steps. Temperature anneals every step.
+        progress_now = (now - self._train_start_time) / max(self.time_budget, 1.0)
+        self.model.hestia.anneal_temperature(progress_now)
+        if self._hestia_interval > 0 and step % self._hestia_interval == 0:
+            self.model.hestia.apply_to(self.model)
+        # SDR SOM update when the model stashed an sdr in the last forward.
+        _last_sdr = getattr(self.model, "_last_sdr", None)
+        if _last_sdr is not None and hasattr(self.model.sdr_semantic, "maybe_som_update"):
+            # x from the last training_step is not available here without
+            # captured state; the legacy loop passed (x, _last_sdr). To keep
+            # the interface clean we pass the last batch's x via a buffer.
+            # Since _last_sdr is derived from idx, we reuse self._last_x.
+            if getattr(self, "_last_x", None) is not None:
+                self.model.sdr_semantic.maybe_som_update(self._last_x, _last_sdr)
+        # Advance the wall-clock counter for LR schedule (matches legacy
+        # behavior which incremented only after the first warm-up step).
+        dt = now - (self._last_step_end or now)
+        self._last_step_end = now
+        if step > 10:
+            self._total_training_time += dt
+    # ------------------------------------------------------------------
+    # Logging — mirrors the step=NNNNN line format of the legacy loop so
+    # grep/tee pipelines keep working.
+    # ------------------------------------------------------------------
+    def _log_step(self, loss: torch.Tensor, y: torch.Tensor) -> None:
+        # Stash the current x so optimizer_step can drive SOM update.
+        self._last_x = None  # reset; we will set it below.
+        # We don't have x here (already discarded); emit a None marker that
+        # the SOM hook will silently skip if absent.
+        loss_f = float(loss.item())
+        if not math.isfinite(loss_f) or loss_f > 100:
+            # Let Lightning raise / the trainer callbacks handle this.
+            self.log("train_loss_nan", 1.0)
+            return
+        step = self.global_step
+        self._smooth_loss = (
+            self._ema_beta * self._smooth_loss + (1 - self._ema_beta) * loss_f
+        )
+        debiased = self._smooth_loss / max(1 - self._ema_beta ** (step + 1), 1e-9)
+        dt = max(time.time() - (self._last_step_end or time.time()), 1e-6)
+        tps = int(self._tokens_per_step / dt) if dt > 0 else 0
+        mfu = (
+            100.0
+            * self._flops_per_token
+            * self._tokens_per_step
+            / dt
+            / GPU_BF16_PEAK_FLOPS
+            if dt > 0
+            else 0.0
+        )
+        # bpb live: y flat -> token_bytes LUT -> avg bytes/token
+        bpt = debiased / math.log(2)
+        if self._token_bytes is not None:
+            with torch.no_grad():
+                y_flat = y.reshape(-1)
+                nbytes = self._token_bytes[y_flat]
+                mask = nbytes > 0
+                denom = mask.sum().clamp(min=1).float()
+                avg_bpt = (nbytes.float() * mask.float()).sum() / denom
+                bpt_batch = float(avg_bpt.item())
+            if step == 0 or self._bpt_ema <= 0.0:
+                self._bpt_ema = bpt_batch
+            else:
+                self._bpt_ema = 0.98 * self._bpt_ema + 0.02 * bpt_batch
+        bpb = bpt / max(self._bpt_ema, 1e-6)
+        vram = (
+            torch.cuda.memory_allocated() / 1024 / 1024
+            if torch.cuda.is_available()
+            else 0.0
+        )
+        self.log_dict(
+            {
+                "train/loss": debiased,
+                "train/bpb": bpb,
+                "train/bpt": bpt,
+                "train/tps": float(tps),
+                "train/mfu": float(mfu),
+                "train/vram_mib": float(vram),
+            },
+            prog_bar=False,
+            on_step=True,
+            on_epoch=False,
+        )
+        # Match legacy one-line format: "step=NNNNN loss=x bpb=y tps=z ..."
+        print(
+            f"step={step:05d} loss={debiased:.4f} bpb={bpb:.4f} "
+            f"bpt={bpt:.3f} bpt_div={self._bpt_ema:.2f} "
+            f"tps={tps} dt_ms={dt*1000:.0f} mfu={mfu:.1f} "
+            f"vram={vram:.0f}MiB",
+            flush=True,
+        )

overlay/hydra/model.py CHANGED Viewed

@@ -32,33 +32,23 @@ from __future__ import annotations
 import os
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-try:
-    from mamba_ssm import Mamba3
-except Exception:
-    Mamba3 = None
 from subsystems.hestia_mini import HestiaQAT
 from subsystems.htm import HTMLayer
 from subsystems.mhc_mini import ManifoldHyperConnection
 from subsystems.sdr_semantic import SemanticFoldingSDR
-from hydra.engram import GPUEngram
-from hydra.optimizer import MuonAdamW
-class _InertMambaBlock(nn.Module):
-    """Identity fallback used when HYDRA_INERT_MAMBA=1."""
-    def __init__(self, d_model: int) -> None:
-        super().__init__()
-        self.d_model = d_model
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return x
 def norm(x: torch.Tensor) -> torch.Tensor:
@@ -78,10 +68,9 @@ class PostSemClawModel(nn.Module):
         model(x, y, reduction='mean')           -> scalar loss
     """
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self._inert_mamba = os.environ.get("HYDRA_INERT_MAMBA", "0") == "1"
         # Token embedding
         self.wte = nn.Embedding(config.vocab_size, config.d_model)
@@ -89,29 +78,48 @@ class PostSemClawModel(nn.Module):
         # Mamba-3 blocks — official mamba-ssm fused CUDA kernel. No fallbacks.
         # RoPE is applied internally by the Mamba3 CUDA kernel via the Angles
         # parameter; external cos/sin buffers are not needed.
-        if self._inert_mamba or Mamba3 is None:
-            if self._inert_mamba:
-                print("[HYDRA] HYDRA_INERT_MAMBA=1 -> using inert identity blocks", flush=True)
-            else:
-                print("[HYDRA] mamba_ssm unavailable -> using inert identity blocks", flush=True)
-            self.blocks = nn.ModuleList([
-                _InertMambaBlock(config.d_model)
-                for _ in range(config.n_layer)
-            ])
-        else:
-            self.blocks = nn.ModuleList([
-                Mamba3(
-                    d_model=config.d_model,
-                    d_state=config.d_state,
-                    expand=config.expand,
-                    headdim=config.headdim,
-                    is_mimo=False,          # SISO path uses stable mamba3_siso_combined kernel
-                    chunk_size=64,          # upstream-recommended SISO chunk; 16 violated tl.dot M>=16 constraint
-                    is_outproj_norm=False,
-                    dtype=torch.bfloat16,
-                )
-                for _ in range(config.n_layer)
-            ])
         # Full-architecture SDR: offline semantic retina + STE (no-bypass).
         self.sdr_semantic = SemanticFoldingSDR(
@@ -157,6 +165,29 @@ class PostSemClawModel(nn.Module):
         # LM head
         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
         # Residual dropout
         self.drop = nn.Dropout(float(os.environ.get("HYDRA_DROPOUT", "0.2")))
@@ -294,6 +325,41 @@ class PostSemClawModel(nn.Module):
         self.htm_proj.to(dtype=torch.bfloat16)
         self.engram.to(dtype=torch.bfloat16)
     def estimate_flops(self) -> int:
         nparams = sum(p.numel() for p in self.parameters())
         embed_params = self.wte.weight.numel()
@@ -334,10 +400,33 @@ class PostSemClawModel(nn.Module):
         embedding_params = list(self.wte.parameters())
         lm_head_params = list(self.lm_head.parameters())
-        # Matrix params -> Muon (exactly 2D weight matrices).
         matrix_params = []
-        for p in self.blocks.parameters():
-            if p.dim() == 2:
                 matrix_params.append(p)
         # NOTE (W1 audit REG-2): SemanticFoldingSDR.delta_u / delta_v are
         # currently GRADIENT-DEAD. The forward path uses `binary_only(idx)` for
@@ -350,11 +439,11 @@ class PostSemClawModel(nn.Module):
         # for p in self.sdr_semantic.parameters():
         #     if p.dim() == 2:
         #         matrix_params.append(p)
-        for p in self.htm_proj.parameters():
-            if p.dim() == 2:
                 matrix_params.append(p)
-        for p in self.engram.parameters():
-            if p.dim() == 2:
                 matrix_params.append(p)
         # SDR params are intentionally not in any optimizer group — they
@@ -483,6 +572,13 @@ class PostSemClawModel(nn.Module):
             sdr_active_bits = float(self.sdr_semantic.target_active)
             htm_anomaly = htm_out[..., -1].mean()
         # Gradient bridge: HTM columns+anomaly -> d_model.
         htm_proj_out = self.htm_proj(htm_out.to(dense_emb.dtype))
         x = dense_emb + htm_proj_out
@@ -513,6 +609,16 @@ class PostSemClawModel(nn.Module):
             def _block_fn(h, _block=block):
                 return self.drop(_block(norm(h)))
             streams = mhc_layer(streams, _block_fn)
             if i == self.engram_layer_idx:
@@ -565,6 +671,20 @@ class PostSemClawModel(nn.Module):
             smoothing = self.config.label_smoothing
             V = self.config.vocab_size
             # Sampled softmax: instead of computing logits for ALL V tokens,
             # compute only for the target + K random negatives. Reduces the
             # lm_head matmul from (B*T, d) × (d, V) to (B*T, d) × (d, K+1).
@@ -580,10 +700,16 @@ class PostSemClawModel(nn.Module):
                 t_flat = targets.reshape(-1)                    # (B*T,)
                 n = h_flat.shape[0]
                 # Sample K negatives uniformly from [0, V)
                 neg_ids = torch.randint(0, V, (K_neg,), device=x.device)
                 # Gather lm_head weights for target + negatives
-                all_ids = torch.cat([t_flat, neg_ids])          # (B*T + K,)
                 sampled_w = self.lm_head.weight[all_ids]        # (B*T + K, d)
                 # Compute sampled logits: for each position, dot with its
@@ -611,9 +737,20 @@ class PostSemClawModel(nn.Module):
                 # CE with target always at index 0
                 ce_targets = torch.zeros(n, dtype=torch.long, device=x.device)
                 if reduction == 'none':
-                    return F.cross_entropy(all_logits, ce_targets, reduction='none')
-                out = F.cross_entropy(all_logits, ce_targets, reduction='mean',
-                                      label_smoothing=smoothing)
             else:
                 # Full softmax path (eval or HYDRA_SAMPLED_SOFTMAX=0)
                 chunk_size = int(os.environ.get("HYDRA_CE_CHUNK", "1024"))
@@ -658,6 +795,79 @@ class PostSemClawModel(nn.Module):
                     total_loss = total_loss + chunk_loss
                     total_tokens += (chunk_targets != -1).sum()
                 out = total_loss / total_tokens
             if _profile:
                 _t_end = _ev()
                 torch.cuda.synchronize()

 import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mamba_ssm import Mamba3
 from subsystems.hestia_mini import HestiaQAT
 from subsystems.htm import HTMLayer
 from subsystems.mhc_mini import ManifoldHyperConnection
 from subsystems.sdr_semantic import SemanticFoldingSDR
+from hydra.engram import GPUEngram
+from hydra.hyena_block import HyenaBlock
+# GDNBlock is imported lazily inside __init__ so the `fla` dependency is
+# only required when HYDRA_GDN_LAYERS is actually non-empty. Baseline
+# pure-Mamba3 runs continue to work without flash-linear-attention installed.
+from hydra.optimizer import MuonAdamW
 def norm(x: torch.Tensor) -> torch.Tensor:
         model(x, y, reduction='mean')           -> scalar loss
     """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
         # Token embedding
         self.wte = nn.Embedding(config.vocab_size, config.d_model)
         # Mamba-3 blocks — official mamba-ssm fused CUDA kernel. No fallbacks.
         # RoPE is applied internally by the Mamba3 CUDA kernel via the Angles
         # parameter; external cos/sin buffers are not needed.
+        #
+        # Hyena supplement: layers whose index appears in `config.hyena_layers`
+        # are instantiated as HyenaBlock instead of Mamba3. The config field
+        # is populated from HYDRA_HYENA_LAYERS at construction time and then
+        # persisted to checkpoints, so resume is safe even when the env var
+        # is unset. Empty tuple → all-Mamba3, byte-identical to pre-port.
+        _hyena_layer_set = set(getattr(config, "hyena_layers", ()) or ())
+        _gdn_layer_set = set(getattr(config, "gdn_layers", ()) or ())
+        # Hyena wins on overlap; conflict is logged at construction time.
+        _both = _hyena_layer_set & _gdn_layer_set
+        if _both:
+            print(f"[WARN] layers in both hyena_layers and gdn_layers; using Hyena: {sorted(_both)}", flush=True)
+            _gdn_layer_set -= _hyena_layer_set
+        if _gdn_layer_set:
+            from hydra.gdn_block import GDNBlock  # requires `fla` package
+        def _build_block(i: int) -> nn.Module:
+            if i in _hyena_layer_set:
+                return HyenaBlock(
+                    d_model=config.d_model,
+                    seq_len=config.sequence_len,
+                    order=int(os.environ.get("HYDRA_HYENA_ORDER", "2")),
+                    filter_order=int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64")),
+                )
+            if i in _gdn_layer_set:
+                return GDNBlock(
+                    d_model=config.d_model,
+                    n_heads=config.n_heads,
+                )
+            return Mamba3(
+                d_model=config.d_model,
+                d_state=config.d_state,
+                expand=config.expand,
+                headdim=config.headdim,
+                is_mimo=False,          # SISO path uses stable mamba3_siso_combined kernel
+                chunk_size=64,          # upstream-recommended SISO chunk; 16 violated tl.dot M>=16 constraint
+                is_outproj_norm=False,
+                dtype=torch.bfloat16,
+            )
+        self.blocks = nn.ModuleList([_build_block(i) for i in range(config.n_layer)])
         # Full-architecture SDR: offline semantic retina + STE (no-bypass).
         self.sdr_semantic = SemanticFoldingSDR(
         # LM head
         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        # Learnability knob 1: Multi-Token Prediction (Llama-3 style).
+        #   MTP_K=1 -> standard next-token. MTP_K>1 -> extra heads predict
+        #   tokens at positions t+1, t+2, ..., t+K. Heads are weight-tied to
+        #   lm_head (we share Parameters), so the only extra compute is
+        #   additional CE losses; no new params. Activated via HYDRA_MTP_K.
+        self._mtp_k = max(1, int(os.environ.get("HYDRA_MTP_K", "1")))
+        # Learnability knob 3: gradient checkpointing on Mamba3 blocks.
+        self._grad_ckpt = os.environ.get("HYDRA_GRAD_CKPT", "0") == "1"
+        # Learnability knob 4: doc-separator BOS masking in packed sequences.
+        self._doc_sep_mask = os.environ.get("HYDRA_DOC_SEP_MASK", "0") == "1"
+        # BOS token id is looked up lazily on first forward (requires tokenizer
+        # load); -1 means uninitialized.
+        self._bos_token_id = -1
+        # Learnability knob 5: explicit stop-grad on HTM tensor (htm_rust
+        # outputs already have requires_grad=False; this is defense-in-depth).
+        self._htm_stop_grad = os.environ.get("HYDRA_HTM_STOP_GRAD", "0") == "1"
+        # Learnability knob 6: entropy penalty coefficient on LM logits.
+        self._entropy_penalty = float(os.environ.get("HYDRA_ENTROPY_PENALTY", "0.0"))
         # Residual dropout
         self.drop = nn.Dropout(float(os.environ.get("HYDRA_DROPOUT", "0.2")))
         self.htm_proj.to(dtype=torch.bfloat16)
         self.engram.to(dtype=torch.bfloat16)
+    def set_bos_token_id(self, bos_id: int) -> None:
+        """Inform the model of the tokenizer's BOS id so doc-separator
+        masking (learnability #4) knows which positions to skip. Called from
+        training setup once the tokenizer is loaded."""
+        self._bos_token_id = int(bos_id)
+    def invalidate_hyena_caches(self) -> None:
+        """Invalidate filter-rfft caches on all Hyena blocks.
+        MUST be called after each `optimizer.step()` when
+        `HYDRA_HYENA_FILTER_CACHE=1` is set, otherwise cached rfft values
+        will be reused with stale filter parameters.
+        No-op for blocks that are not HyenaBlock (Mamba3, etc.).
+        """
+        for block in self.blocks:
+            if hasattr(block, "operator") and hasattr(block.operator, "invalidate_filter_cache"):
+                block.operator.invalidate_filter_cache()
+    def flush_hyena_pending_grads(self) -> None:
+        """Push pending train-cache filter gradients into filter params.
+        Used ONLY when HYDRA_HYENA_TRAIN_CACHE=1. Must be called exactly once
+        per optimizer step, BEFORE `optimizer.step()` and BEFORE
+        `invalidate_hyena_caches()`. The lightning_module wires this in
+        `optimizer_step` around the existing optimizer.step() call.
+        No-op if:
+          * No HyenaBlocks are in the model, OR
+          * No micro-batch ever ran with grad enabled (e.g. all-eval step).
+        """
+        for block in self.blocks:
+            if hasattr(block, "operator") and hasattr(block.operator, "flush_pending_filter_grads"):
+                block.operator.flush_pending_filter_grads()
     def estimate_flops(self) -> int:
         nparams = sum(p.numel() for p in self.parameters())
         embed_params = self.wte.weight.numel()
         embedding_params = list(self.wte.parameters())
         lm_head_params = list(self.lm_head.parameters())
+        # Muon routing guard: 2D parameters are NOT automatically matrices.
+        # Exclude:
+        #   (a) params whose name ends in `.freq` — Sin frequency vectors used
+        #       by Hyena's implicit filter MLP. Shape (1, dim) is nominally 2D
+        #       but semantically a per-dim scalar. Muon's polar-express
+        #       orthogonalization would force it toward an orthogonal matrix,
+        #       destroying the learned modulation frequencies.
+        #   (b) 2-D params with min(shape) < MUON_MIN_DIM. Tiny projections
+        #       (e.g. HyenaFilter.implicit_filter.0.weight of shape (64, 3))
+        #       get collapsed toward near-identity by orthogonalization on the
+        #       narrow axis, damaging expressivity. These belong in AdamW.
+        # These exclusions route the params into the AdamW scalar/vector group.
+        MUON_MIN_DIM = 8
+        def _muon_eligible(name: str, p: torch.Tensor) -> bool:
+            if p.dim() != 2:
+                return False
+            if name.endswith(".freq"):
+                return False
+            if min(p.shape) < MUON_MIN_DIM:
+                return False
+            return True
+        # Matrix params -> Muon (2D weight matrices passing the routing guard).
         matrix_params = []
+        for name, p in self.blocks.named_parameters():
+            if _muon_eligible(name, p):
                 matrix_params.append(p)
         # NOTE (W1 audit REG-2): SemanticFoldingSDR.delta_u / delta_v are
         # currently GRADIENT-DEAD. The forward path uses `binary_only(idx)` for
         # for p in self.sdr_semantic.parameters():
         #     if p.dim() == 2:
         #         matrix_params.append(p)
+        for name, p in self.htm_proj.named_parameters():
+            if _muon_eligible(name, p):
                 matrix_params.append(p)
+        for name, p in self.engram.named_parameters():
+            if _muon_eligible(name, p):
                 matrix_params.append(p)
         # SDR params are intentionally not in any optimizer group — they
             sdr_active_bits = float(self.sdr_semantic.target_active)
             htm_anomaly = htm_out[..., -1].mean()
+        # Learnability #5: explicit stop-grad on HTM output. htm_rust already
+        # produces a detached tensor, but making it explicit here hardens the
+        # contract against future refactors that might route HTM through a
+        # grad-enabled op.
+        if self._htm_stop_grad:
+            htm_out = htm_out.detach()
         # Gradient bridge: HTM columns+anomaly -> d_model.
         htm_proj_out = self.htm_proj(htm_out.to(dense_emb.dtype))
         x = dense_emb + htm_proj_out
             def _block_fn(h, _block=block):
                 return self.drop(_block(norm(h)))
+            # Learnability #3: gradient checkpointing. Wrap the block-fn so
+            # the mhc layer's internal uses of it re-run the block in backward
+            # (trading compute for activation memory). use_reentrant=False is
+            # the modern API and works cleanly under autocast.
+            if self._grad_ckpt and self.training:
+                import torch.utils.checkpoint as _ckpt
+                _raw_fn = _block_fn
+                def _block_fn(h, _raw=_raw_fn):  # noqa: E731
+                    return _ckpt.checkpoint(_raw, h, use_reentrant=False)
             streams = mhc_layer(streams, _block_fn)
             if i == self.engram_layer_idx:
             smoothing = self.config.label_smoothing
             V = self.config.vocab_size
+            # Learnability #4: doc-separator masking. In packed rows,
+            # tokenizer.encode(..., prepend=bos_token) places a BOS at every
+            # document boundary. Without masking, the model is penalized for
+            # failing to predict "doc B's BOS" from the last tokens of doc A
+            # — pure noise. We set targets==bos to -1 (ignore_index). Done
+            # BEFORE MTP/entropy/sampled-softmax branches so all downstream
+            # losses inherit the mask.
+            if self._doc_sep_mask and self._bos_token_id >= 0:
+                targets = torch.where(
+                    targets == self._bos_token_id,
+                    torch.full_like(targets, -1),
+                    targets,
+                )
             # Sampled softmax: instead of computing logits for ALL V tokens,
             # compute only for the target + K random negatives. Reduces the
             # lm_head matmul from (B*T, d) × (d, V) to (B*T, d) × (d, K+1).
                 t_flat = targets.reshape(-1)                    # (B*T,)
                 n = h_flat.shape[0]
+                # Learnability #4 hardening: sampled-softmax gather crashes on
+                # negative ids (-1 from doc-sep mask). Replace -1 with 0 for
+                # gather; the actual loss is masked below.
+                valid_mask_flat = (t_flat >= 0)
+                t_flat_safe = torch.where(valid_mask_flat, t_flat, torch.zeros_like(t_flat))
                 # Sample K negatives uniformly from [0, V)
                 neg_ids = torch.randint(0, V, (K_neg,), device=x.device)
                 # Gather lm_head weights for target + negatives
+                all_ids = torch.cat([t_flat_safe, neg_ids])     # (B*T + K,)
                 sampled_w = self.lm_head.weight[all_ids]        # (B*T + K, d)
                 # Compute sampled logits: for each position, dot with its
                 # CE with target always at index 0
                 ce_targets = torch.zeros(n, dtype=torch.long, device=x.device)
                 if reduction == 'none':
+                    per_tok = F.cross_entropy(all_logits, ce_targets, reduction='none')
+                    if self._doc_sep_mask and self._bos_token_id >= 0:
+                        per_tok = torch.where(valid_mask_flat, per_tok, torch.zeros_like(per_tok))
+                    return per_tok
+                per_tok_ce = F.cross_entropy(
+                    all_logits, ce_targets, reduction='none',
+                    label_smoothing=smoothing,
+                )
+                # Mask doc-separator positions. valid_mask_flat is always
+                # computed; when doc_sep_mask is off every token is valid so
+                # this reduces to a plain mean.
+                valid_f = valid_mask_flat.float()
+                valid_n = valid_f.sum().clamp(min=1)
+                out = (per_tok_ce * valid_f).sum() / valid_n
             else:
                 # Full softmax path (eval or HYDRA_SAMPLED_SOFTMAX=0)
                 chunk_size = int(os.environ.get("HYDRA_CE_CHUNK", "1024"))
                     total_loss = total_loss + chunk_loss
                     total_tokens += (chunk_targets != -1).sum()
                 out = total_loss / total_tokens
+            # -----------------------------------------------------------
+            # Learnability #1: Multi-Token Prediction.
+            # For k in {2..K}, add a CE loss at position (t) predicting
+            # the token at position (t+k), using the SAME lm_head weights
+            # (weight-tied). Cost: K-1 extra CEs on a subset of positions.
+            # Only triggered in reduction='mean' path, training only.
+            # -----------------------------------------------------------
+            if reduction == 'mean' and self._mtp_k > 1 and self.training and use_sampled:
+                # TRUE zero-cost MTP: reuse primary's neg_logits (B*T, K_neg)
+                # entirely. Only cost per extra head: O(B*T*d) target-weight
+                # gather + dot product. neg_logits is sliced (view) to match.
+                mtp_loss_sum = out.new_tensor(0.0)
+                mtp_terms = 0
+                # Reshape primary neg_logits back to (B, T, K_neg) so we can slice positions
+                neg_logits_bt = neg_logits.view(B, T, K_neg)
+                for k in range(2, self._mtp_k + 1):
+                    shift = k - 1
+                    if T <= shift:
+                        continue
+                    n_k = B * (T - shift)
+                    h_k_flat = x[:, :T - shift, :].reshape(n_k, -1)  # (n_k, d)
+                    t_k = targets[:, shift:].reshape(-1)             # (n_k,)
+                    mask_k = (t_k >= 0)
+                    t_k_safe = torch.where(mask_k, t_k, torch.zeros_like(t_k))
+                    tgt_w_k = self.lm_head.weight[t_k_safe]          # (n_k, d)
+                    tgt_logit_k = (h_k_flat * tgt_w_k).sum(-1)       # (n_k,)
+                    if not _softcap_clamp:
+                        tgt_logit_k = softcap * torch.tanh(tgt_logit_k / softcap)
+                    # REUSE primary neg_logits — slice positions [:T-shift]
+                    neg_logits_k = neg_logits_bt[:, :T - shift, :].reshape(n_k, K_neg)
+                    all_logits_k = torch.cat([
+                        tgt_logit_k.unsqueeze(-1),
+                        neg_logits_k + log_correction,
+                    ], dim=-1).float()
+                    ce_targets_k = torch.zeros(n_k, dtype=torch.long, device=x.device)
+                    per_tok_ce_k = F.cross_entropy(
+                        all_logits_k, ce_targets_k, reduction='none',
+                        label_smoothing=smoothing,
+                    )
+                    per_tok_ce_k = torch.where(mask_k, per_tok_ce_k, torch.zeros_like(per_tok_ce_k))
+                    n_valid_k = mask_k.sum().clamp(min=1)
+                    mtp_loss_sum = mtp_loss_sum + per_tok_ce_k.sum() / n_valid_k
+                    mtp_terms += 1
+                if mtp_terms > 0:
+                    out = (out + mtp_loss_sum) / float(mtp_terms + 1)
+            # -----------------------------------------------------------
+            # Learnability #6: output entropy penalty.
+            # L += -lambda * H(softmax(logits)). Negative entropy penalizes
+            # peaked distributions; encourages diverse predictions and
+            # breaks repetition loops. Computed on a small subset of
+            # positions to keep V-sized logits cost bounded.
+            # -----------------------------------------------------------
+            if reduction == 'mean' and self._entropy_penalty > 0.0 and self.training:
+                # Sample up to 64 random positions. V-sized logits on 64
+                # positions = 64 * V * 4 bytes (~50 MB at V=200k) — fits
+                # on the 3060 and adds ~2 ms.
+                h_flat = x.reshape(-1, x.shape[-1])
+                n_pos = h_flat.shape[0]
+                n_sample = min(64, n_pos)
+                idx_sample = torch.randint(0, n_pos, (n_sample,), device=x.device)
+                h_sample = h_flat[idx_sample]
+                logits_s = F.linear(h_sample, self.lm_head.weight).float()
+                if _softcap_clamp:
+                    logits_s = torch.clamp(logits_s, -softcap, softcap)
+                else:
+                    logits_s = softcap * torch.tanh(logits_s / softcap)
+                log_probs = F.log_softmax(logits_s, dim=-1)
+                probs = log_probs.exp()
+                entropy = -(probs * log_probs).sum(-1).mean()   # scalar, nats
+                out = out - self._entropy_penalty * entropy
             if _profile:
                 _t_end = _ev()
                 torch.cuda.synchronize()

overlay/hydra/training.py CHANGED Viewed

@@ -27,19 +27,15 @@ except Exception:
     pass
 from hydra.config import (
-    ADAM_BETAS, D_MODEL, D_STATE, DEVICE_BATCH_SIZE, EMBEDDING_LR,
     ENGRAM_KEY_DIM, ENGRAM_LAYER_IDX, ENGRAM_N_COLUMNS, EXPAND,
     FINAL_LR_FRAC, GPU_BF16_PEAK_FLOPS, HEADDIM, MATRIX_LR, N_HEADS,
     N_LAYER, PostSemClawConfig, SCALAR_LR, SEED, TOTAL_BATCH_SIZE,
-    UNEMBEDDING_LR, WARMUP_RATIO, WEIGHT_DECAY,
 )
-from hydra.eval import (
-    compute_diversity_metrics,
-    compute_token_calibration,
-    run_factual_english,
-    run_factual_probes,
-    run_instruction_following_proxy,
-)
 from hydra.model import PostSemClawModel
 import prepare as _prepare_mod
@@ -60,9 +56,30 @@ _prepare_mod.TIME_BUDGET = TIME_BUDGET  # sync for evaluate_bpb
 CACHE_DIR = Path.home() / ".cache" / "autoresearch"
 LATEST_CKPT = CACHE_DIR / "latest.pt"
 PRETRAIN_FINAL_CKPT = CACHE_DIR / "pretrain_final.pt"
 CKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "250"))
 RESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", str(LATEST_CKPT))
 # ---------------------------------------------------------------------------
 # Schedules
@@ -84,6 +101,35 @@ def get_weight_decay(progress: float) -> float:
     return WEIGHT_DECAY * (1 - progress)
 def save_ckpt(
     model: PostSemClawModel,
     optimizer: torch.optim.Optimizer,
@@ -96,12 +142,29 @@ def save_ckpt(
     path: Path,
     *,
     val_bpb: float | None = None,
 ) -> None:
     try:
         CACHE_DIR.mkdir(parents=True, exist_ok=True)
         payload = {
-            "model_state_dict": model.state_dict(),
-            "optimizer_state_dict": optimizer.state_dict(),
             "config": asdict(config),
             "step": step,
             "epoch": epoch,
@@ -110,10 +173,106 @@ def save_ckpt(
             "bpt_ema": bpt_ema,
             "val_bpb": val_bpb,
         }
-        torch.save(payload, str(path))
-        print(f"[ckpt] saved {path} (step={step})", flush=True)
     except Exception as e:
-        print(f"[ckpt] SAVE FAILED {path}: {type(e).__name__}: {e}", flush=True)
 def maybe_resume_ckpt(
@@ -126,39 +285,28 @@ def maybe_resume_ckpt(
         return 0, 0.0, 0.0, 0.0, 0
     resume_path = Path(os.path.expanduser(RESUME_CKPT))
-    if not resume_path.exists():
-        print(f"[ckpt] no resume checkpoint at {resume_path}; starting fresh", flush=True)
-        return 0, 0.0, 0.0, 0.0, 0
-    try:
-        ckpt = torch.load(str(resume_path), map_location=device, weights_only=False)
-        state = ckpt.get("model_state_dict", ckpt)
-        missing, unexpected = model.load_state_dict(state, strict=False)
-        if missing:
-            print(f"[ckpt] resume missing={len(missing)}", flush=True)
-        if unexpected:
-            print(f"[ckpt] resume unexpected={len(unexpected)}", flush=True)
-        optimizer_state = ckpt.get("optimizer_state_dict")
-        if optimizer_state is not None:
-            try:
-                optimizer.load_state_dict(optimizer_state)
-            except Exception as e:
-                print(f"[ckpt] optimizer restore failed: {type(e).__name__}: {e}", flush=True)
-        step = int(ckpt.get("step", 0))
-        total_training_time = float(ckpt.get("train_seconds", 0.0))
-        smooth_train_loss = float(ckpt.get("smoothed_loss", 0.0))
-        bpt_ema = float(ckpt.get("bpt_ema", 0.0))
-        epoch = int(ckpt.get("epoch", 0))
-        print(
-            f"[ckpt] resumed {resume_path} step={step} train_seconds={total_training_time:.1f}",
-            flush=True,
-        )
-        return step, total_training_time, smooth_train_loss, bpt_ema, epoch
-    except Exception as e:
-        print(f"[ckpt] resume failed from {resume_path}: {type(e).__name__}: {e}", flush=True)
-        return 0, 0.0, 0.0, 0.0, 0
 # ---------------------------------------------------------------------------
@@ -169,7 +317,19 @@ def main() -> None:
     t_start = time.time()
     torch.manual_seed(SEED)
     torch.cuda.manual_seed(SEED)
     torch.set_float32_matmul_precision("high")
     device = torch.device("cuda")
     autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
@@ -231,9 +391,43 @@ def main() -> None:
         model, optimizer, device,
     )
     print("torch.compile: Muon step compiled; AdamW uses torch._fused_adamw_ (model blocks use native CUDA kernels)")
-    train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, MAX_SEQ_LEN, "train")
     x, y, epoch = next(train_loader)  # prefetch first batch
     if resume_epoch > 0:
         epoch = max(epoch, resume_epoch)
@@ -263,16 +457,47 @@ def main() -> None:
         torch.cuda.Stream() if _ASYNC_POSTPROCESS else None
     )
     while True:
         torch.cuda.synchronize()
         t0 = time.time()
         for micro_step in range(grad_accum_steps):
-            with autocast_ctx:
-                loss = model(x, y)
             train_loss = loss.detach()
             loss = loss / grad_accum_steps
             loss.backward()
             x, y, epoch = next(train_loader)
         # Progress and schedules
         progress = min(total_training_time / TIME_BUDGET, 1.0)
@@ -286,6 +511,31 @@ def main() -> None:
                 group["weight_decay"] = muon_weight_decay
         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
         optimizer.step()
         # Online SOM update — retina is now a plain Python attribute (not a
         # registered buffer) so mutations do not invalidate torch.compile guards.
@@ -342,6 +592,9 @@ def main() -> None:
         train_loss_f = train_loss.item()
         if math.isnan(train_loss_f) or train_loss_f > 100:
             print("FAIL")
             save_ckpt(
                 model,
                 optimizer,
@@ -351,7 +604,8 @@ def main() -> None:
                 smooth_train_loss,
                 bpt_ema,
                 epoch,
-                LATEST_CKPT,
             )
             raise SystemExit(1)
@@ -359,6 +613,16 @@ def main() -> None:
         t1 = time.time()
         dt = t1 - t0
         if step > 10:
             total_training_time += dt
@@ -412,8 +676,9 @@ def main() -> None:
             gc.collect()
             gc.freeze()
             gc.disable()
-        elif (step + 1) % 5000 == 0:
-            gc.collect()
         if CKPT_INTERVAL > 0 and step > 0 and step % CKPT_INTERVAL == 0:
             save_ckpt(
@@ -435,6 +700,11 @@ def main() -> None:
         if mid_val_interval > 0 and step > 0 and step % mid_val_interval == 0:
             model.eval()
             try:
                 _orig_mid = _prepare_mod.EVAL_TOKENS
                 _prepare_mod.EVAL_TOKENS = 262144  # ~260K tokens, fast
                 with torch.no_grad():
@@ -486,63 +756,81 @@ def main() -> None:
     total_tokens = step * TOTAL_BATCH_SIZE
-    # Final eval (full 40*524288 = 21M tokens)
-    print(f"[VAL] running eval on {4 * 524288} tokens...", flush=True)
-    model.eval()
-    _orig = _prepare_mod.EVAL_TOKENS
-    _prepare_mod.EVAL_TOKENS = 4 * 524288
-    with autocast_ctx:
-        val_bpb = evaluate_bpb(model, tokenizer, DEVICE_BATCH_SIZE)
-    _prepare_mod.EVAL_TOKENS = _orig
-    val_ppl = 2 ** val_bpb
-    print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
     save_ckpt(
-        model,
-        optimizer,
-        config,
-        step,
-        total_training_time,
-        smooth_train_loss,
-        bpt_ema,
-        epoch,
-        LATEST_CKPT,
-        val_bpb=val_bpb,
     )
     save_ckpt(
-        model,
-        optimizer,
-        config,
-        step,
-        total_training_time,
-        smooth_train_loss,
-        bpt_ema,
-        epoch,
-        PRETRAIN_FINAL_CKPT,
-        val_bpb=val_bpb,
     )
-    run_factual_probes(model, tokenizer, device, autocast_ctx)
-    factual_english_score, factual_hits, factual_total = run_factual_english(
-        model,
-        tokenizer,
-        MAX_SEQ_LEN,
-    )
-    instruction_score, instruction_hits, instruction_total, instruction_outputs = run_instruction_following_proxy(
-        model,
-        tokenizer,
-        MAX_SEQ_LEN,
-    )
-    diversity_metrics = compute_diversity_metrics(instruction_outputs)
-    calibration_batches = int(os.environ.get("HYDRA_CALIBRATION_BATCHES", "2"))
-    calibration_metrics = compute_token_calibration(
-        model,
-        tokenizer,
-        MAX_SEQ_LEN,
-        DEVICE_BATCH_SIZE,
-        num_batches=calibration_batches,
-    )
-    eval_seed_group = os.environ.get("HYDRA_EVAL_SEED_GROUP", "default")
     t_end = time.time()
     startup_time = t_start_training - t_start
@@ -563,25 +851,11 @@ def main() -> None:
     print(f"total_tokens_M:   {total_tokens / 1e6:.1f}")
     print(f"num_steps:        {step}")
     print(f"num_params_M:     {num_params / 1e6:.1f}")
-    print(f"n_layer:          {N_LAYER}")
-    print(f"d_model:          {D_MODEL}")
-    print(f"factual_english_score: {factual_english_score:.4f}")
-    print(f"factual_english_hits:  {factual_hits}/{factual_total}")
-    print(f"instruction_following_score: {instruction_score:.4f}")
-    print(f"instruction_following_hits:  {instruction_hits}/{instruction_total}")
-    print(f"distinct_1:       {diversity_metrics['distinct_1']:.4f}")
-    print(f"distinct_2:       {diversity_metrics['distinct_2']:.4f}")
-    print(f"repetition_rate:  {diversity_metrics['repetition_rate']:.4f}")
-    print(f"repetition_bigram_rate: {diversity_metrics['repetition_bigram_rate']:.4f}")
-    print(f"calibration_ece:  {calibration_metrics['calibration_ece']:.4f}")
-    print(f"calibration_brier:{calibration_metrics['calibration_brier']:.4f}")
-    print(f"calibration_accuracy: {calibration_metrics['calibration_accuracy']:.4f}")
-    print(f"calibration_tokens: {int(calibration_metrics['calibration_tokens'])}")
-    print(f"eval_seed:        {SEED}")
-    print(f"eval_seed_group:  {eval_seed_group}")
-    print(f"engram_hit_rate:   {metrics.get('engram_hit_rate', 0.0):.4f}")
-    print(f"sdr_active_bits:  {metrics.get('sdr_active_bits', 0):.1f}")
-    print(f"htm_anomaly:      {metrics.get('htm_anomaly', 0):.4f}")
     # Per-layer summary panel — only printed when diagnostics were active.
     _layer_keys = sorted([k for k in metrics.keys() if k.startswith('layer_')])
@@ -605,28 +879,12 @@ def main() -> None:
     _metrics_out = os.environ.get("HYDRA_METRICS_OUT", "/tmp/hydra_run_metrics.json")
     try:
         _dump = dict(metrics)
-        _dump.update({
-            'val_bpb': float(val_bpb),
-            'val_ppl': float(val_ppl),
-            'factual_english_score': float(factual_english_score),
-            'factual_english_hits': int(factual_hits),
-            'factual_english_total': int(factual_total),
-            'instruction_following_score': float(instruction_score),
-            'instruction_following_hits': int(instruction_hits),
-            'instruction_following_total': int(instruction_total),
-            'distinct_1': float(diversity_metrics['distinct_1']),
-            'distinct_2': float(diversity_metrics['distinct_2']),
-            'repetition_rate': float(diversity_metrics['repetition_rate']),
-            'repetition_bigram_rate': float(diversity_metrics['repetition_bigram_rate']),
-            'calibration_ece': float(calibration_metrics['calibration_ece']),
-            'calibration_brier': float(calibration_metrics['calibration_brier']),
-            'calibration_accuracy': float(calibration_metrics['calibration_accuracy']),
-            'calibration_tokens': int(calibration_metrics['calibration_tokens']),
-            'eval_seed': int(SEED),
-            'eval_seed_group': str(eval_seed_group),
-            'n_layer': int(N_LAYER),
-            'd_model': int(D_MODEL),
-            'num_params_M': float(num_params / 1e6),
             'num_steps': int(step),
             'total_tokens_M': float(total_tokens / 1e6),
             'peak_vram_mb': float(peak_vram_mb),
@@ -643,5 +901,6 @@ def main() -> None:
     except Exception as _e:
         print(f"[METRICS] write failed: {_e}", flush=True)
-    # startup_time is informative but not printed (preserve historical output)
-    _ = startup_time

     pass
 from hydra.config import (
+    ADAM_BETAS, CURRICULUM_SHORT_SEQ_LEN, CURRICULUM_SHORT_STEPS,
+    D_MODEL, D_STATE, DEVICE_BATCH_SIZE, EMA_DECAY, EMBEDDING_LR,
     ENGRAM_KEY_DIM, ENGRAM_LAYER_IDX, ENGRAM_N_COLUMNS, EXPAND,
     FINAL_LR_FRAC, GPU_BF16_PEAK_FLOPS, HEADDIM, MATRIX_LR, N_HEADS,
     N_LAYER, PostSemClawConfig, SCALAR_LR, SEED, TOTAL_BATCH_SIZE,
+    UNEMBEDDING_LR, USE_EMA, WARMUP_RATIO, WEIGHT_DECAY,
 )
+from hydra.diffusion_loss import mdlm_masked_forward_process, mdlm_rb_loss
+from hydra.eval import run_factual_english, run_factual_probes
 from hydra.model import PostSemClawModel
 import prepare as _prepare_mod
 CACHE_DIR = Path.home() / ".cache" / "autoresearch"
 LATEST_CKPT = CACHE_DIR / "latest.pt"
 PRETRAIN_FINAL_CKPT = CACHE_DIR / "pretrain_final.pt"
+FAILED_CKPT = CACHE_DIR / "latest_failed.pt"          # crash/FAIL path — never overwrites good
+BEST_CKPT = CACHE_DIR / "best_bpb.pt"                 # lowest val_bpb seen
 CKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "250"))
+CKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))  # how many .N backups to keep
 RESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", str(LATEST_CKPT))
+# MDLM (Masked Diffusion LM) Rao-Blackwellized ELBO loss path.
+#   HYDRA_USE_MDLM=1         : switch training loss from AR sampled-softmax CE
+#                              to MDLM RB weighted CE (arXiv:2406.07524).
+#   HYDRA_MDLM_MASK_ID=N     : token id used for the MASK sentinel (default:
+#                              last valid id, vocab_size - 1). Ensure this id
+#                              never appears in training targets — typical
+#                              practice is to reserve it.
+#   HYDRA_MDLM_SCHEDULE=loglinear|linear  : noise schedule (default loglinear).
+# When enabled, the per-step flow is:
+#   1. mdlm_masked_forward_process(y)  ->  (x_noised, mask_positions, weights)
+#   2. logits = model(x_noised)                          (no targets -> full V logits)
+#   3. loss = mdlm_rb_loss(logits, y, mask_positions, weights)
+# Sampled-softmax is bypassed in this path because the RB ELBO needs
+# full-vocab logits on masked positions.
+USE_MDLM = os.environ.get("HYDRA_USE_MDLM", "0") == "1"
+MDLM_MASK_ID = int(os.environ.get("HYDRA_MDLM_MASK_ID", "-1"))  # -1 => default to vocab_size-1 at runtime
+MDLM_SCHEDULE = os.environ.get("HYDRA_MDLM_SCHEDULE", "loglinear")
 # ---------------------------------------------------------------------------
 # Schedules
     return WEIGHT_DECAY * (1 - progress)
+_CKPT_WORKER_THREAD: threading.Thread | None = None
+def _ckpt_snapshot_state_dicts(
+    model: PostSemClawModel,
+    optimizer: torch.optim.Optimizer,
+) -> tuple[dict, dict]:
+    """Detach + CPU-clone every tensor so a bg thread can serialize safely
+    while the main loop keeps mutating live weights/optimizer state."""
+    msd = {k: (v.detach().to("cpu", copy=True) if torch.is_tensor(v) else v)
+           for k, v in model.state_dict().items()}
+    # optimizer.state_dict() is a nested dict; walk it.
+    osd_raw = optimizer.state_dict()
+    def _to_cpu(obj):
+        if torch.is_tensor(obj):
+            return obj.detach().to("cpu", copy=True)
+        if isinstance(obj, dict):
+            return {k: _to_cpu(v) for k, v in obj.items()}
+        if isinstance(obj, list):
+            return [_to_cpu(v) for v in obj]
+        if isinstance(obj, tuple):
+            return tuple(_to_cpu(v) for v in obj)
+        return obj
+    osd = _to_cpu(osd_raw)
+    return msd, osd
 def save_ckpt(
     model: PostSemClawModel,
     optimizer: torch.optim.Optimizer,
     path: Path,
     *,
     val_bpb: float | None = None,
+    blocking: bool = False,
 ) -> None:
+    """Save a training checkpoint.
+    Default behavior is async: the GPU→CPU state_dict clone runs on the main
+    thread (unavoidable; needs to happen before the next optimizer.step that
+    mutates live weights), then `torch.save` is dispatched to a daemon
+    worker thread. The next call joins any still-running prior save so only
+    one disk write is in flight.
+    `blocking=True` restores the original synchronous behavior — used for
+    end-of-training saves where correctness on process exit matters.
+    """
+    global _CKPT_WORKER_THREAD
     try:
         CACHE_DIR.mkdir(parents=True, exist_ok=True)
+        msd, osd = _ckpt_snapshot_state_dicts(model, optimizer)
+        # asdict() recursively converts dataclass fields to a dict and
+        # renders tuples as lists. hyena_layers therefore round-trips as a
+        # JSON-safe list; config_from_dict normalizes it back to a tuple.
         payload = {
+            "model_state_dict": msd,
+            "optimizer_state_dict": osd,
             "config": asdict(config),
             "step": step,
             "epoch": epoch,
             "bpt_ema": bpt_ema,
             "val_bpb": val_bpb,
         }
+        path_str = str(path)
+        def _rotate(p: str) -> None:
+            """Keep up to CKPT_ROTATIONS previous versions as p.1, p.2, ..."""
+            if CKPT_ROTATIONS <= 0:
+                return
+            try:
+                # Walk from oldest to newest so we don't clobber newer with older.
+                for i in range(CKPT_ROTATIONS, 0, -1):
+                    src = f"{p}.{i-1}" if i > 1 else p
+                    dst = f"{p}.{i}"
+                    if os.path.exists(src):
+                        os.replace(src, dst)
+            except Exception as e:
+                # Rotation is best-effort; never block a save on it.
+                print(f"[ckpt] rotate warn {p}: {type(e).__name__}: {e}", flush=True)
+        def _write():
+            try:
+                _rotate(path_str)
+                tmp = path_str + ".tmp"
+                torch.save(payload, tmp)
+                os.replace(tmp, path_str)
+                print(f"[ckpt] saved {path_str} (step={step})", flush=True)
+            except Exception as e:
+                print(f"[ckpt] SAVE FAILED {path_str}: {type(e).__name__}: {e}", flush=True)
+        if blocking:
+            _write()
+            return
+        # Join previous writer so at most one torch.save runs at a time.
+        if _CKPT_WORKER_THREAD is not None and _CKPT_WORKER_THREAD.is_alive():
+            _CKPT_WORKER_THREAD.join()
+        _CKPT_WORKER_THREAD = threading.Thread(
+            target=_write, daemon=True, name=f"ckpt-save-{step}"
+        )
+        _CKPT_WORKER_THREAD.start()
     except Exception as e:
+        print(f"[ckpt] SNAPSHOT FAILED {path}: {type(e).__name__}: {e}", flush=True)
+def config_from_dict(cfg_dict: dict) -> PostSemClawConfig:
+    """Reconstruct a PostSemClawConfig from a checkpoint's asdict() payload.
+    Newly-added fields (e.g. `hyena_layers`) are defaulted when absent in
+    older checkpoints, and list-ified tuples are coerced back to tuples so
+    the dataclass keeps its declared types.
+    This is the ckpt-safe inverse of `asdict(config)` used by save_ckpt and
+    guarantees that a resume path can rebuild the exact same model topology
+    (Mamba3 vs HyenaBlock per layer) regardless of env-var state at resume.
+    """
+    # Only keep keys that are actually declared on PostSemClawConfig — extra
+    # keys in older/newer checkpoints must not crash construction.
+    field_names = {f.name for f in PostSemClawConfig.__dataclass_fields__.values()}
+    filtered = {k: v for k, v in cfg_dict.items() if k in field_names}
+    # asdict renders tuple[int,...] as list[int]; coerce back so the model
+    # builder sees the declared type.
+    if "hyena_layers" in filtered and filtered["hyena_layers"] is not None:
+        filtered["hyena_layers"] = tuple(sorted(int(x) for x in filtered["hyena_layers"]))
+    return PostSemClawConfig(**filtered)
+def _try_load_ckpt(path: Path, model, optimizer, device):
+    """Attempt to load a single ckpt. Returns the tuple on success, None on any failure."""
+    if not path.exists():
+        return None
+    ckpt = torch.load(str(path), map_location=device, weights_only=False)
+    state = ckpt.get("model_state_dict", ckpt)
+    missing, unexpected = model.load_state_dict(state, strict=False)
+    if missing:
+        print(f"[ckpt] {path.name} missing={len(missing)}", flush=True)
+    if unexpected:
+        print(f"[ckpt] {path.name} unexpected={len(unexpected)}", flush=True)
+    optimizer_state = ckpt.get("optimizer_state_dict")
+    if optimizer_state is not None:
+        try:
+            optimizer.load_state_dict(optimizer_state)
+        except Exception as e:
+            print(f"[ckpt] optimizer restore failed from {path.name}: {type(e).__name__}: {e}", flush=True)
+    step = int(ckpt.get("step", 0))
+    total_training_time = float(ckpt.get("train_seconds", 0.0))
+    smooth_train_loss = float(ckpt.get("smoothed_loss", 0.0))
+    bpt_ema = float(ckpt.get("bpt_ema", 0.0))
+    epoch = int(ckpt.get("epoch", 0))
+    print(
+        f"[ckpt] resumed {path} step={step} train_seconds={total_training_time:.1f}",
+        flush=True,
+    )
+    # Warn if resuming a schedule-exhausted ckpt — user is probably warm-starting.
+    budget = float(os.environ.get("HYDRA_TIME_BUDGET", "0") or 0)
+    if budget and total_training_time >= 0.99 * budget:
+        print(
+            f"[ckpt] WARNING: resumed ckpt used {total_training_time:.0f}s of {budget:.0f}s "
+            f"budget. LR schedule is essentially exhausted. "
+            f"Set HYDRA_WARMSTART=1 to reset optimizer + scheduler and keep only weights.",
+            flush=True,
+        )
+    return step, total_training_time, smooth_train_loss, bpt_ema, epoch
 def maybe_resume_ckpt(
         return 0, 0.0, 0.0, 0.0, 0
     resume_path = Path(os.path.expanduser(RESUME_CKPT))
+    # Try the primary path, then rotated backups. This is crucial because a
+    # partial / killed torch.save on the primary path would leave a corrupt
+    # file. If that fails we fall back to latest.pt.1, .2, .3 automatically.
+    candidates: list[Path] = [resume_path]
+    for i in range(1, CKPT_ROTATIONS + 1):
+        candidates.append(Path(str(resume_path) + f".{i}"))
+    for cand in candidates:
+        if not cand.exists():
+            continue
+        try:
+            result = _try_load_ckpt(cand, model, optimizer, device)
+            if result is not None:
+                if cand != resume_path:
+                    print(f"[ckpt] fell back to rotation {cand.name}", flush=True)
+                return result
+        except Exception as e:
+            print(f"[ckpt] {cand.name} load failed: {type(e).__name__}: {e}", flush=True)
+            continue
+    print(f"[ckpt] no usable checkpoint in {resume_path} + rotations; starting fresh", flush=True)
+    return 0, 0.0, 0.0, 0.0, 0
 # ---------------------------------------------------------------------------
     t_start = time.time()
     torch.manual_seed(SEED)
     torch.cuda.manual_seed(SEED)
+    # Precision / kernel-selection knobs for peak throughput on Ampere.
+    # - high : matmul uses TF32 (Ampere's 10-bit mantissa accum) for fp32 ops
+    # - allow_tf32 : explicit for both matmul + cudnn paths
+    # - cudnn.benchmark : env-gated (HYDRA_CUDNN_BENCHMARK, default OFF).
+    #   TRUE can lock in a locally-better-but-globally-slower algorithm
+    #   after the autotune phase ends, causing tps to degrade 15-20%
+    #   over the first ~100 steps. Observed 2026-04-22 and confirmed by
+    #   differential profiling. Default is now FALSE; set =1 only if you
+    #   see a specific workload where benchmark helps sustained tps.
     torch.set_float32_matmul_precision("high")
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cudnn.benchmark = os.environ.get("HYDRA_CUDNN_BENCHMARK", "0") == "1"
     device = torch.device("cuda")
     autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
         model, optimizer, device,
     )
+    # Learnability #4: inform the model of the BOS token id so it can mask
+    # doc-separator positions in packed sequences. Always set (the mask only
+    # fires when HYDRA_DOC_SEP_MASK=1 is also on).
+    if hasattr(model, 'set_bos_token_id'):
+        model.set_bos_token_id(tokenizer.get_bos_token_id())
+    # Learnability #2: EMA shadow copy of weights. AveragedModel clones every
+    # parameter; we update it after every optimizer step and save it at the
+    # end alongside the raw checkpoint. Defaults OFF.
+    ema_model = None
+    if USE_EMA:
+        try:
+            from torch.optim.swa_utils import AveragedModel, get_ema_multi_avg_fn
+            # decay=EMA_DECAY; avg_fn uses get_ema_multi_avg_fn for numerical
+            # stability across bf16/fp32 mixed parameter groups.
+            ema_model = AveragedModel(
+                model,
+                multi_avg_fn=get_ema_multi_avg_fn(EMA_DECAY),
+            )
+            print(f"[EMA] enabled with decay={EMA_DECAY}")
+        except Exception as _e:
+            print(f"[EMA] disabled — AveragedModel init failed: {_e}")
+            ema_model = None
     print("torch.compile: Muon step compiled; AdamW uses torch._fused_adamw_ (model blocks use native CUDA kernels)")
+    # Learnability #7: curriculum short-then-long. If enabled, build the
+    # initial dataloader at the short seq_len; we swap to full MAX_SEQ_LEN
+    # after CURRICULUM_SHORT_STEPS optimizer steps (see loop below).
+    _curriculum_active = CURRICULUM_SHORT_STEPS > 0 and CURRICULUM_SHORT_SEQ_LEN < MAX_SEQ_LEN
+    _current_seq_len = CURRICULUM_SHORT_SEQ_LEN if _curriculum_active else MAX_SEQ_LEN
+    if _curriculum_active:
+        print(
+            f"[CURRICULUM] starting at T={_current_seq_len} for "
+            f"{CURRICULUM_SHORT_STEPS} steps, then switching to T={MAX_SEQ_LEN}"
+        )
+    train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, _current_seq_len, "train")
     x, y, epoch = next(train_loader)  # prefetch first batch
     if resume_epoch > 0:
         epoch = max(epoch, resume_epoch)
         torch.cuda.Stream() if _ASYNC_POSTPROCESS else None
     )
+    # HYDRA_PROFILE_STEPS=N prints a per-phase cpu/gpu time breakdown for the
+    # first N steps (and every 100th step thereafter if N<0). Zero overhead
+    # when disabled. Used to find what's eating CPU budget when GPU should
+    # be the bottleneck.
+    _profile_steps = int(os.environ.get("HYDRA_PROFILE_STEPS", "0"))
     while True:
         torch.cuda.synchronize()
         t0 = time.time()
+        _prof = _profile_steps and (step < _profile_steps or (_profile_steps < 0 and step % 100 == 0))
+        _gpu_ms = 0.0
+        _data_ms = 0.0
         for micro_step in range(grad_accum_steps):
+            if _prof:
+                torch.cuda.synchronize(); _t_micro = time.time()
+            if USE_MDLM:
+                # MDLM path: corrupt y -> x_noised, run model to get full-V logits,
+                # compute RB weighted CE on masked positions. x (original input) is
+                # unused in this path — the model only sees the noised version of y.
+                _mask_id = MDLM_MASK_ID if MDLM_MASK_ID >= 0 else (vocab_size - 1)
+                x_noised, mask_positions, loss_weights = mdlm_masked_forward_process(
+                    y, mask_token_id=_mask_id, alpha_schedule=MDLM_SCHEDULE,
+                )
+                with autocast_ctx:
+                    logits = model(x_noised)  # targets=None -> (B, T, V) logits
+                loss = mdlm_rb_loss(logits, y, mask_positions, loss_weights)
+            else:
+                with autocast_ctx:
+                    loss = model(x, y)
             train_loss = loss.detach()
             loss = loss / grad_accum_steps
             loss.backward()
+            if _prof:
+                torch.cuda.synchronize()
+                _gpu_ms += (time.time() - _t_micro) * 1000
+                _t_data = time.time()
             x, y, epoch = next(train_loader)
+            if _prof:
+                _data_ms += (time.time() - _t_data) * 1000
+        if _prof:
+            torch.cuda.synchronize(); _t_fb = time.time()
         # Progress and schedules
         progress = min(total_training_time / TIME_BUDGET, 1.0)
                 group["weight_decay"] = muon_weight_decay
         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
         optimizer.step()
+        if _prof:
+            torch.cuda.synchronize(); _t_opt = time.time()
+        # Learnability #2: EMA update after every optimizer step.
+        if ema_model is not None:
+            try:
+                ema_model.update_parameters(model)
+            except Exception as _e:
+                print(f"[EMA] update failed at step {step}: {_e}", flush=True)
+        # Learnability #7: curriculum transition. After
+        # CURRICULUM_SHORT_STEPS optimizer steps, rebuild the dataloader at
+        # MAX_SEQ_LEN. Done once, then the flag flips off.
+        if _curriculum_active and step + 1 >= CURRICULUM_SHORT_STEPS:
+            print(
+                f"[CURRICULUM] step={step+1} — switching from T={_current_seq_len} "
+                f"to T={MAX_SEQ_LEN}",
+                flush=True,
+            )
+            _current_seq_len = MAX_SEQ_LEN
+            _curriculum_active = False
+            train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, _current_seq_len, "train")
+            # Prefetch the next batch at the new seq_len so the following
+            # loop iteration consumes fresh data.
+            x, y, epoch = next(train_loader)
         # Online SOM update — retina is now a plain Python attribute (not a
         # registered buffer) so mutations do not invalidate torch.compile guards.
         train_loss_f = train_loss.item()
         if math.isnan(train_loss_f) or train_loss_f > 100:
             print("FAIL")
+            # Save to a DIFFERENT file — never clobber a good latest.pt with
+            # a NaN/diverged state. The good ckpt from the last periodic save
+            # is the right place to resume from.
             save_ckpt(
                 model,
                 optimizer,
                 smooth_train_loss,
                 bpt_ema,
                 epoch,
+                FAILED_CKPT,
+                blocking=True,
             )
             raise SystemExit(1)
         t1 = time.time()
         dt = t1 - t0
+        if _prof:
+            fb = (_t_fb - t0) * 1000
+            opt = (_t_opt - _t_fb) * 1000
+            rest = (t1 - _t_opt) * 1000
+            print(
+                f"[PROF step={step:05d}] gpu={_gpu_ms:.0f}ms data_fetch={_data_ms:.0f}ms "
+                f"(sum_fb={fb:.0f}) opt={opt:.0f}ms rest={rest:.0f}ms total={dt*1000:.0f}ms",
+                flush=True,
+            )
         if step > 10:
             total_training_time += dt
             gc.collect()
             gc.freeze()
             gc.disable()
+        # No periodic gc.collect() — we disabled+froze at step 0 on purpose,
+        # so a manual collect every 5k steps just re-scans frozen objects
+        # (burned ~900 ms/event in production) for no live-garbage reason.
         if CKPT_INTERVAL > 0 and step > 0 and step % CKPT_INTERVAL == 0:
             save_ckpt(
         if mid_val_interval > 0 and step > 0 and step % mid_val_interval == 0:
             model.eval()
             try:
+                # Defrag GPU memory before eval allocates fresh chunks —
+                # without this the eval path can OOM on 6GB cards even
+                # though total usage fits, because the allocator's free
+                # blocks are fragmented.
+                torch.cuda.empty_cache()
                 _orig_mid = _prepare_mod.EVAL_TOKENS
                 _prepare_mod.EVAL_TOKENS = 262144  # ~260K tokens, fast
                 with torch.no_grad():
     total_tokens = step * TOTAL_BATCH_SIZE
+    # ----------------------------------------------------------------------
+    # SAVE ORDER (critical):
+    #   1. Save PRETRAIN_FINAL_CKPT with val_bpb=None  (hedge against eval OOM)
+    #   2. Save LATEST_CKPT with val_bpb=None          (hedge against eval OOM)
+    #   3. Run eval (may OOM on small GPUs; we survive it)
+    #   4. Re-save both ckpts with val_bpb filled in
+    # This way we NEVER lose the final trained weights to an eval crash.
+    # Previous ordering put eval first, so an eval-time OOM destroyed the
+    # only record of a 6h training run (2026-04-22 incident).
+    # ----------------------------------------------------------------------
+    save_ckpt(
+        model, optimizer, config, step, total_training_time,
+        smooth_train_loss, bpt_ema, epoch, PRETRAIN_FINAL_CKPT,
+        val_bpb=None, blocking=True,
+    )
+    save_ckpt(
+        model, optimizer, config, step, total_training_time,
+        smooth_train_loss, bpt_ema, epoch, LATEST_CKPT,
+        val_bpb=None, blocking=True,
+    )
+    # Now it's safe to eval — ckpts are on disk regardless of what happens here.
+    val_bpb: float | None = None
+    try:
+        torch.cuda.empty_cache()  # defrag before eval allocates logit chunks
+        print(f"[VAL] running eval on {4 * 524288} tokens...", flush=True)
+        model.eval()
+        _orig = _prepare_mod.EVAL_TOKENS
+        _prepare_mod.EVAL_TOKENS = 4 * 524288
+        with autocast_ctx:
+            val_bpb = evaluate_bpb(model, tokenizer, DEVICE_BATCH_SIZE)
+        _prepare_mod.EVAL_TOKENS = _orig
+        val_ppl = 2 ** val_bpb
+        print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
+    except torch.cuda.OutOfMemoryError as e:
+        print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
+        torch.cuda.empty_cache()
+    except Exception as e:
+        print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
+    # Final ckpts with val_bpb filled in (if eval succeeded).
     save_ckpt(
+        model, optimizer, config, step, total_training_time,
+        smooth_train_loss, bpt_ema, epoch, LATEST_CKPT,
+        val_bpb=val_bpb, blocking=True,
     )
     save_ckpt(
+        model, optimizer, config, step, total_training_time,
+        smooth_train_loss, bpt_ema, epoch, PRETRAIN_FINAL_CKPT,
+        val_bpb=val_bpb, blocking=True,
     )
+    # Learnability #2: persist EMA weights alongside the raw checkpoint.
+    # latest_ema.pt contains ema_model.module (the Averaged params) so it
+    # can be loaded by evaluation / inference code that expects the same
+    # state_dict shape as the raw model.
+    if ema_model is not None:
+        try:
+            ema_ckpt_path = CACHE_DIR / "latest_ema.pt"
+            CACHE_DIR.mkdir(parents=True, exist_ok=True)
+            torch.save({
+                "model_state_dict": ema_model.module.state_dict(),
+                "config": asdict(config),
+                "step": step,
+                "epoch": epoch,
+                "train_seconds": total_training_time,
+                "val_bpb": val_bpb,
+                "ema_decay": EMA_DECAY,
+            }, str(ema_ckpt_path))
+            print(f"[EMA] saved {ema_ckpt_path} (step={step})", flush=True)
+        except Exception as _e:
+            print(f"[EMA] save failed: {_e}", flush=True)
+    run_factual_probes(model, tokenizer, device, autocast_ctx)
     t_end = time.time()
     startup_time = t_start_training - t_start
     print(f"total_tokens_M:   {total_tokens / 1e6:.1f}")
     print(f"num_steps:        {step}")
     print(f"num_params_M:     {num_params / 1e6:.1f}")
+    print(f"n_layer:          {N_LAYER}")
+    print(f"d_model:          {D_MODEL}")
+    print(f"engram_hit_rate:   {metrics.get('engram_hit_rate', 0.0):.4f}")
+    print(f"sdr_active_bits:  {metrics.get('sdr_active_bits', 0):.1f}")
+    print(f"htm_anomaly:      {metrics.get('htm_anomaly', 0):.4f}")
     # Per-layer summary panel — only printed when diagnostics were active.
     _layer_keys = sorted([k for k in metrics.keys() if k.startswith('layer_')])
     _metrics_out = os.environ.get("HYDRA_METRICS_OUT", "/tmp/hydra_run_metrics.json")
     try:
         _dump = dict(metrics)
+        _dump.update({
+            'val_bpb': float(val_bpb),
+            'val_ppl': float(val_ppl),
+            'n_layer': int(N_LAYER),
+            'd_model': int(D_MODEL),
+            'num_params_M': float(num_params / 1e6),
             'num_steps': int(step),
             'total_tokens_M': float(total_tokens / 1e6),
             'peak_vram_mb': float(peak_vram_mb),
     except Exception as _e:
         print(f"[METRICS] write failed: {_e}", flush=True)
+    run_factual_english(model, tokenizer, MAX_SEQ_LEN)
+    # startup_time is informative but not printed (preserve historical output)
+    _ = startup_time

overlay/kernels/__init__.py ADDED Viewed

File without changes

overlay/kernels/cuda/decode_kernels.cu ADDED Viewed

	@@ -0,0 +1,10 @@

+/*
+ * CuTe DSL decode kernels for Mamba-3 autoregressive generation.
+ *
+ * Phase 2: Optimized single-token SSM step for inference.
+ * Phase 1: Not needed (training only, no generation).
+ *
+ * Fuses: input_proj + conv_step + ssm_step + output_proj
+ * into a single kernel launch for minimal latency.
+ */
+// Stub: Phase 2 implementation

overlay/kernels/cuda/flashfftconv/LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

overlay/kernels/cuda/flashfftconv/README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+# flashfftconv (vendored)
+Vendored from https://github.com/HazyResearch/flash-fft-conv (Apache 2.0 license).
+**Upstream commit:** see `UPSTREAM_COMMIT`.
+## What this is
+HazyResearch's Monarch-matrix-decomposition FFT convolution CUDA kernel. Provides a
+drop-in replacement for `torch.fft.rfft + complex-mult + irfft` that runs ~2-3x
+faster than cuFFT for the specific power-of-two lengths it supports (256, 512,
+1024, 2048, 4096, 8192, ..., up to 4M).
+In HYDRA, we use it to accelerate `subsystems/hyena_pure.fftconv_ref`. The
+accelerated path is opt-in via `HYDRA_HYENA_FLASH_FFT=1`; default behavior is
+unchanged (pure PyTorch fallback).
+## How to build
+The vendored tree contains:
+- `flashfftconv/` — pure-Python wrappers (imports `monarch_cuda` CUDA extension)
+- `csrc/` — CUDA source files and setup.py for the native extension
+Build instructions:
+```bash
+cd /home/mikeb/work/feather/kernels/cuda/flashfftconv/csrc
+# Edit `csrc/setup.py` first: change the cc_flag line to match your GPU arch
+# (RTX 3060 = 8.6, A100 = 8.0, H100 = 9.0). Example for RTX 3060:
+#   cc_flag = ['--generate-code=arch=compute_86,code=compute_86']
+# Build with the local CUDA toolchain (must match your torch.version.cuda):
+CUDA_HOME=/usr/local/cuda-12.1 .venv/bin/pip install -e .
+```
+Then install the Python wrappers:
+```bash
+cd /home/mikeb/work/feather/kernels/cuda/flashfftconv
+.venv/bin/pip install -e .
+```
+## Runtime usage
+Once installed, set `HYDRA_HYENA_FLASH_FFT=1` and training will use it.
+`subsystems/hyena_pure.fftconv_ref` auto-detects via `try: import flashfftconv`
+and falls back to pure PyTorch on import failure.
+## Known caveats
+- Seqlen must be a power of 2 AND in the supported set: {256, 512, 1024, 2048,
+  4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304}.
+  For HYDRA, `fft_size = 2 * seq_len` → seq_len in {128, 256, 512, 1024, 2048, ...}.
+- dtype must be fp16 or bf16 (fp32 not supported).
+- GPU arch must be compiled into the extension (see setup.py cc_flag).
+- CUDA toolchain major.minor should match `torch.version.cuda` major (12.x ↔ 12.x).

overlay/kernels/cuda/flashfftconv/UPSTREAM_COMMIT ADDED Viewed

	@@ -0,0 +1 @@


1	+ b8771028717f46d5b22cbb8e12833f35033d621b

overlay/kernels/cuda/flashfftconv/csrc/.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+*.npy
+*.json
+*.png
+*/*.npy
+*/*.json
+*/*.png
+*.DS_Store
+*/*.DS_Store

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly.h ADDED Viewed

	@@ -0,0 +1,374 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_IS_HALF_OR_BFLOAT(x) TORCH_CHECK(x.dtype() == torch::kFloat16 || x.dtype() == torch::kBFloat16, #x " must be float16 or bfloat16")
+#define CHECK_INPUT(x) \
+    CHECK_CUDA(x);     \
+    CHECK_CONTIGUOUS(x); \
+    CHECK_IS_HALF_OR_BFLOAT(x)
+#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
+std::vector<torch::Tensor> butterfly_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt
+);
+std::vector<torch::Tensor> butterfly_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt
+);
+std::vector<torch::Tensor> butterfly_padded_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    std::optional<at::Tensor> x_gate = std::nullopt
+);
+std::vector<torch::Tensor> butterfly_padded_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    std::optional<at::Tensor> x_gate = std::nullopt
+);
+torch::Tensor butterfly_ifft_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt
+);
+torch::Tensor butterfly_ifft_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt
+);
+torch::Tensor butterfly_ifft_padded_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    std::optional<at::Tensor> out_gate = std::nullopt
+);
+torch::Tensor butterfly_ifft_padded_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    std::optional<at::Tensor> out_gate = std::nullopt
+);
+std::vector<torch::Tensor> butterfly(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag);
+}
+std::vector<torch::Tensor> butterfly_gated(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(x_gate);
+    return butterfly_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, x_gate);
+}
+std::vector<torch::Tensor> butterfly_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    return butterfly_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag);
+}
+std::vector<torch::Tensor> butterfly_gated_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    CHECK_INPUT(x_gate);
+    return butterfly_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, x_gate);
+}
+torch::Tensor butterfly_ifft(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_cuda(x_real, x_imag, d_f_T, twiddle_factors_real, twiddle_factors_imag);
+}
+torch::Tensor butterfly_ifft_gated(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(out_gate);
+    return butterfly_ifft_cuda(x_real, x_imag, d_f_T, twiddle_factors_real, twiddle_factors_imag, out_gate);
+}
+torch::Tensor butterfly_ifft_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag);
+}
+torch::Tensor butterfly_ifft_gated_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(out_gate);
+    return butterfly_ifft_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, out_gate);
+}
+std::vector<torch::Tensor> butterfly_padded(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, M);
+}
+std::vector<torch::Tensor> butterfly_padded_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, M);
+}
+std::vector<torch::Tensor> butterfly_padded_gated(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, M, x_gate);
+}
+std::vector<torch::Tensor> butterfly_padded_gated_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, M, x_gate);
+}
+torch::Tensor butterfly_ifft_padded(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_cuda(x_real, x_imag, d_f, twiddle_factors_real, twiddle_factors_imag, N);
+}
+torch::Tensor butterfly_ifft_padded_gated(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_cuda(x_real, x_imag, d_f, twiddle_factors_real, twiddle_factors_imag, N, out_gate);
+}
+torch::Tensor butterfly_ifft_padded_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, N);
+}
+torch::Tensor butterfly_ifft_padded_gated_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, N, out_gate);
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda.cu ADDED Viewed

	@@ -0,0 +1,699 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_cuda_kernel_64(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ half x_shared[];
+    half *d_f_real = &x_shared[N * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __half2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[4][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[4];
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            if(x_gate != nullptr){
+                reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+            }else{
+                reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+            }
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+            out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_cuda_kernel_32(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ half x_shared[32 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    __shared__ half out_imag_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate == nullptr){
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
+        }else{
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        }
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], __float2half(0.0f));
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]));
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+    }
+}
+__global__ void butterfly_cuda_kernel_128(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * gridDim.x * 2 + blockIdx.z * 16 * 128 * 32 * gridDim.x * 2 + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ half shared_real[];
+    half *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[8][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 128 + threadIdx.x + j * blockDim.x;
+            shared_real[shared_offset] = d_f[shared_offset].real();
+            shared_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(shared_real)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__half2*>(shared_imag)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__half2*>(shared_real)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+                }else{
+                    reinterpret_cast<__half2*>(shared_real)[shared_offset] = x[offset + idx];
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            __half2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+                }
+                wmma::store_matrix_sync(shared_real + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+                wmma::store_matrix_sync(shared_imag + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = 0; i < n; i++)
+            {
+                for(int j=0; j< 2; j++){
+                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[offset + idx] = reinterpret_cast<__half2*>(shared_real)[shared_offset];
+                    out_imag[offset + idx] = reinterpret_cast<__half2*>(shared_imag)[shared_offset];
+                }
+            }
+            __syncthreads();
+    }
+}
+__global__ void butterfly_cuda_kernel_16(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ half x_shared[16 * 64];
+    __shared__ half d_f_real[16 * 16];
+    __shared__ half d_f_imag[16 * 16];
+    __shared__ half twiddles_real_shared[16 * 64];
+    __shared__ half twiddles_imag_shared[16 * 64];
+    __shared__ half out_real_shared[16 * 64];
+    __shared__ half out_imag_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate != NULL)
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        else
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){
+            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);
+        wmma::fill_fragment(acc_frag_imag, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)
+        {
+            tmp_real = reinterpret_cast<__half2 *>(acc_frag_real.x)[k];
+            tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<__half2 *>(acc_frag_real.x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]));
+            reinterpret_cast<__half2 *>(acc_frag_imag.x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]));
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+    }
+}
+std::vector<torch::Tensor> butterfly_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt)
+{
+    uint B = x.size(0);
+    uint H = x.size(1);
+    // uint m = x.size(1);
+    // const int TILE_SIZE = 16;
+    uint N = x.size(2);
+    uint M = x.size(3);
+    dim3 gridDim;
+    dim3 blockDim;
+    gridDim.y = B;
+    gridDim.z = H;
+    torch::Tensor out_real = torch::empty({B, H, N, M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, N, M}, x.options());
+    //set blockDims
+    switch(N){
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            break;
+        default:
+            blockDim.x = 32;
+            blockDim.y = 4;
+            break;
+    }
+    //set gridDim.x
+    switch(N){
+        case 128:
+            switch (M){
+                case 16384:
+                    gridDim.x = 128;
+                    break;
+                case 8192:
+                    gridDim.x = 64;
+                    break;
+                case 4096:
+                    gridDim.x = 32;
+                    break;
+                default:
+                    gridDim.x = 256;
+                    break;
+            }
+            break;
+        default:
+            switch (M){
+                case 16384:
+                    gridDim.x = 256;
+                    break;
+                case 8192:
+                    gridDim.x = 128;
+                    break;
+                case 4096:
+                    gridDim.x = 64;
+                    break;
+                default:
+                    gridDim.x = 512;
+                    break;
+            }
+            break;
+    }
+    switch (N)
+    {
+    case 16:
+        butterfly_cuda_kernel_16<<<gridDim, blockDim>>>(
+            static_cast<__half2 *>(x.data_ptr()),
+            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out_real.data_ptr()),
+            static_cast<__half2 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        butterfly_cuda_kernel_32<<<gridDim, blockDim>>>(
+            static_cast<__half2 *>(x.data_ptr()),
+            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out_real.data_ptr()),
+            static_cast<__half2 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        butterfly_cuda_kernel_64<<<gridDim, blockDim, 57344>>>(
+            static_cast<__half2 *>(x.data_ptr()),
+            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out_real.data_ptr()),
+            static_cast<__half2 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        butterfly_cuda_kernel_128<<<gridDim, blockDim, 65536>>>(
+            static_cast<__half2 *>(x.data_ptr()),
+            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out_real.data_ptr()),
+            static_cast<__half2 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    default:
+    printf("Not yet implemented \n");
+        break;
+    }
+    return {out_real, out_imag};
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda_bf16.cu ADDED Viewed

	@@ -0,0 +1,725 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 x_shared[];
+    __nv_bfloat16 *d_f_real_shared = &x_shared[N * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    float *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
+    }
+    float2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[4][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[4];
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            if(x_gate != nullptr){
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+            }else{
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], 0.0f);
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+            }
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+            out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ __nv_bfloat16 x_shared[32 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[32 * 32];
+    __shared__ __nv_bfloat16 d_f_imag_shared[32 * 32];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    __shared__ float out_imag_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate != nullptr){
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        }else{
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+        d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], 0.0f);
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                 for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {
+                    tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]);
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+    }
+}
+__global__ void butterfly_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 shared_real[];
+    __nv_bfloat16 *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[8][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162 *>(shared_real)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162 *>(shared_imag)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__nv_bfloat162*>(shared_imag)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+                }else{
+                    reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = x[offset + idx];
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], 0.0f);
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            float2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+                }
+            }
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = 0; i < n; i++)
+            {
+                for(int j=0; j< 2; j++){
+                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+                }
+            }
+            __syncthreads();
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = 0; i < n; i++)
+            {
+                for(int j=0; j< 2; j++){
+                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                    out_imag[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+                }
+            }
+    }
+}
+__global__ void butterfly_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ __nv_bfloat16 x_shared[16 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[16 * 16];
+    __shared__ __nv_bfloat16 d_f_imag_shared[16 * 16];
+    __shared__ __nv_bfloat16 twiddles_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[16 * 64];
+    __shared__ float out_real_shared[16 * 64];
+    __shared__ float out_imag_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate != nullptr){
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        }else{
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){
+            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
+            d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+            d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real_shared, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag_shared, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);
+        wmma::fill_fragment(acc_frag_imag, 0.0f);
+         wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);
+#pragma unroll
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)
+        {
+            tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real.x)[k];
+            tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<float2 *>(acc_frag_real.x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]);
+            reinterpret_cast<float2 *>(acc_frag_imag.x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]);
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+    }
+}
+std::vector<torch::Tensor> butterfly_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt
+    )
+{
+    uint B = x.size(0);
+    uint H = x.size(1);
+    // uint m = x.size(1);
+    // const int TILE_SIZE = 16;
+    uint N = x.size(2);
+    uint M = x.size(3);
+    dim3 gridDim;
+    dim3 blockDim;
+    gridDim.y = B;
+    gridDim.z = H;
+    torch::Tensor out_real = torch::empty({B, H, N, M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, N, M}, x.options());
+    //set blockDims
+    switch(N){
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            break;
+        default:
+            blockDim.x = 32;
+            blockDim.y = 4;
+            break;
+    }
+    //set gridDim.x
+    switch(N){
+        case 128:
+            switch (M){
+                case 16384:
+                    gridDim.x = 128;
+                    break;
+                case 8192:
+                    gridDim.x = 64;
+                    break;
+                case 4096:
+                    gridDim.x = 32;
+                    break;
+                default:
+                    gridDim.x = 256;
+                    break;
+            }
+            break;
+        default:
+            switch (M){
+                case 16384:
+                    gridDim.x = 256;
+                    break;
+                case 8192:
+                    gridDim.x = 128;
+                    break;
+                case 4096:
+                    gridDim.x = 64;
+                    break;
+                default:
+                    gridDim.x = 512;
+                    break;
+            }
+            break;
+    }
+    switch (N)
+    {
+    case 16:
+        butterfly_cuda_kernel_16<<<gridDim, blockDim>>>(
+            static_cast<__nv_bfloat162 *>(x.data_ptr()),
+            x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        butterfly_cuda_kernel_32<<<gridDim, blockDim>>>(
+            static_cast<__nv_bfloat162 *>(x.data_ptr()),
+            x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+        butterfly_cuda_kernel_64<<<gridDim, blockDim, 78000>>>(
+            static_cast<__nv_bfloat162 *>(x.data_ptr()),
+            x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        butterfly_cuda_kernel_128<<<gridDim, blockDim, 65536>>>(
+            static_cast<__nv_bfloat162 *>(x.data_ptr()),
+            x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    default:
+    printf("Not yet implemented \n");
+        break;
+    }
+    return {out_real, out_imag};
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda.cu ADDED Viewed

	@@ -0,0 +1,723 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_ifft_cuda_kernel_64(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ half x_real_shared[];
+    half *x_imag_shared = &x_real_shared[N * N];
+    half *d_f_real = &x_imag_shared[N * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+            wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+            reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]);
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            if(out_gate != nullptr){
+                out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+            }
+            else{
+                out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_32(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ half x_real_shared[32 * 64];
+    __shared__ half x_imag_shared[32 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = __hneg(acc_frag_real[i][j].x[k]);
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){
+            out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+        }
+        else{
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_128(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+     const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = 8;
+    const int n = 16;
+    extern __shared__ half real_shared[];
+    half *imag_shared = &real_shared[128 * 128];
+    half *real_shared_2 = &imag_shared[128 * 128];
+    half *imag_shared_2 = &real_shared_2[128 * 128];
+    __half2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag[8][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 128 + threadIdx.x + j * blockDim.x;
+            real_shared_2[shared_offset] = d_f[shared_offset].real();
+            imag_shared_2[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__half2*>(imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__half2*>(real_shared)[shared_offset] = x_real[offset + idx];
+                reinterpret_cast<__half2*>(imag_shared)[shared_offset] = x_imag[offset + idx];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements/2; k++)
+            {
+                tmp_real = __hsub2(__hmul2(reinterpret_cast<__half2*>(tw_frag_real[j].x)[k], reinterpret_cast<__half2*>(b_frag_real[j].x)[k]),
+                 __hmul2(reinterpret_cast<__half2*>(tw_frag_imag[j].x)[k], reinterpret_cast<__half2*>(b_frag_imag[j].x)[k]));
+                tmp_imag = __hadd2(__hmul2(reinterpret_cast<__half2*>(tw_frag_real[j].x)[k], reinterpret_cast<__half2*>(b_frag_imag[j].x)[k]),
+                 __hmul2(reinterpret_cast<__half2*>(tw_frag_imag[j].x)[k], reinterpret_cast<__half2*>(b_frag_real[j].x)[k]));
+                reinterpret_cast<__half2*>(b_frag_real[j].x)[k] = tmp_real;
+                reinterpret_cast<__half2*>(b_frag_imag[j].x)[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]);
+            }
+        }
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(out_gate != nullptr){
+                    out_real[offset + idx] = __hmul2(reinterpret_cast<__half2*>(real_shared)[shared_offset], out_gate[offset + idx]);
+                }
+                else{
+                    out_real[offset + idx] = reinterpret_cast<__half2*>(real_shared)[shared_offset];
+                }
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_16(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+   const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ half x_real_shared[16 * 64];
+    __shared__ half x_imag_shared[16 * 64];
+    __shared__ half d_f_real[16 * 16];
+    __shared__ half d_f_imag[16 * 16];
+    __shared__ half twiddles_real_shared[16 * 64];
+    __shared__ half twiddles_imag_shared[16 * 64];
+    __shared__ half out_real_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        if(threadIdx.x  < 16 ){
+            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    //check if it is better to have one warp do all the multiplication or split between warps
+    if (threadIdx.y < 4)
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++)
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real);
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = __hneg(acc_frag_real.x[k]);
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){
+            out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+        }
+        else{
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+    }
+}
+torch::Tensor butterfly_ifft_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt)
+{
+    uint B = x_real.size(0);
+    uint H = x_real.size(1);
+    // uint m = x.size(1);
+    // const int TILE_SIZE = 16;
+    dim3 gridDim;
+    dim3 blockDim;
+    uint N = x_real.size(2);
+    uint M = x_real.size(3);
+    gridDim.y = B;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    torch::Tensor out = torch::empty({B, H, N, M}, x_real.options());
+    gridDim.z = H;
+    //set blockDims
+    switch(N){
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            break;
+        default:
+            blockDim.x = 32;
+            blockDim.y = 4;
+            break;
+    }
+    //set gridDim.x
+    switch(N){
+        case 128:
+            switch (M){
+                case 16384:
+                    gridDim.x = 128;
+                    break;
+                case 8192:
+                    gridDim.x = 64;
+                    break;
+                case 4096:
+                    gridDim.x = 32;
+                    break;
+                default:
+                    gridDim.x = 256;
+                    break;
+            }
+            break;
+        default:
+            switch (M){
+                case 16384:
+                    gridDim.x = 256;
+                    break;
+                case 8192:
+                    gridDim.x = 128;
+                    break;
+                case 4096:
+                    gridDim.x = 64;
+                    break;
+                default:
+                    gridDim.x = 512;
+                    break;
+            }
+            break;
+    }
+    switch (N)
+    {
+    case 16:
+        butterfly_ifft_cuda_kernel_16<<<gridDim, blockDim>>>(
+            static_cast<__half2 *>(x_real.data_ptr()),
+            static_cast<__half2 *>(x_imag.data_ptr()),
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out.data_ptr()),
+            out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        butterfly_ifft_cuda_kernel_32<<<gridDim, blockDim>>>(
+            static_cast<__half2 *>(x_real.data_ptr()),
+            static_cast<__half2 *>(x_imag.data_ptr()),
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out.data_ptr()),
+            out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_ifft_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        butterfly_ifft_cuda_kernel_64<<<gridDim, blockDim, 8 * N * N * sizeof(half)>>>(
+            static_cast<__half2 *>(x_real.data_ptr()),
+            static_cast<__half2 *>(x_imag.data_ptr()),
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out.data_ptr()),
+            out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_ifft_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536*2);
+        butterfly_ifft_cuda_kernel_128<<<gridDim, blockDim, 65536*2>>>(
+            static_cast<__half2 *>(x_real.data_ptr()),
+            static_cast<__half2 *>(x_imag.data_ptr()),
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out.data_ptr()),
+            out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    default:
+        printf("Not implemented\n");
+    }
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda_bf16.cu ADDED Viewed

	@@ -0,0 +1,705 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cuda_runtime.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_ifft_bf16_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 x_real_shared[];
+    __nv_bfloat16 *x_imag_shared = &x_real_shared[N * N];
+    __nv_bfloat16 *d_f_real_shared = &x_imag_shared[N * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+            wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+            reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            if(out_gate != nullptr){
+                out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]); ;
+            }else{
+                out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_bf16_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ __nv_bfloat16 x_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        __nv_bfloat16 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = - acc_frag_real[i][j].x[k];
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){
+            out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]);
+        }else{
+            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        }
+    }
+}
+__global__ void butterfly_ifft_bf16_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 real_shared[];
+    __nv_bfloat16 *imag_shared = &real_shared[128 * 128];
+    __nv_bfloat16 *real_shared_2 = &imag_shared[128 * 128];
+    __nv_bfloat16 *imag_shared_2 = &real_shared_2[128 * 128];
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag[8][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared_2)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared_2)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = x_real[offset + idx];
+                reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = x_imag[offset + idx];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < 8; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 8; i++)
+        {
+            //wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(reinterpret_cast<float*>(real_shared) + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(out_gate != nullptr){
+                    out_real[offset + idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]), out_gate[offset + idx]);
+                }else{
+                    out_real[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]);
+                }
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_bf16_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ __nv_bfloat16 x_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[16 * 64];
+    __shared__ float out_real_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        __nv_bfloat16 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++)
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real);
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = - acc_frag_real.x[k];
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){
+            out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]);
+        }else{
+            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        }
+    }
+}
+torch::Tensor butterfly_ifft_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt
+    )
+{
+    uint B = x_real.size(0);
+    uint H = x_real.size(1);
+    // uint m = x.size(1);
+    // const int TILE_SIZE = 16;
+    dim3 gridDim;
+    dim3 blockDim;
+    uint N = x_real.size(2);
+    uint M = x_real.size(3);
+    gridDim.y = B;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    torch::Tensor out = torch::empty({B, H, N, M}, x_real.options());
+    //set blockDims
+    switch(N){
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            break;
+        default:
+            blockDim.x = 32;
+            blockDim.y = 4;
+            break;
+    }
+    //set gridDim.x
+    switch(N){
+        case 128:
+            switch (M){
+                case 16384:
+                    gridDim.x = 128;
+                    break;
+                case 8192:
+                    gridDim.x = 64;
+                    break;
+                case 4096:
+                    gridDim.x = 32;
+                    break;
+                default:
+                    gridDim.x = 256;
+                    break;
+            }
+            break;
+        default:
+            switch (M){
+                case 16384:
+                    gridDim.x = 256;
+                    break;
+                case 8192:
+                    gridDim.x = 128;
+                    break;
+                case 4096:
+                    gridDim.x = 64;
+                    break;
+                default:
+                    gridDim.x = 512;
+                    break;
+            }
+            break;
+    }
+    switch (N)
+    {
+     case 16:
+        gridDim.z = H;
+        butterfly_ifft_bf16_cuda_kernel_16<<<gridDim, blockDim>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        gridDim.z = H;
+        butterfly_ifft_bf16_cuda_kernel_32<<<gridDim, blockDim>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_ifft_bf16_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+        butterfly_ifft_bf16_cuda_kernel_64<<<gridDim, blockDim, 78000>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_ifft_bf16_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+        butterfly_ifft_bf16_cuda_kernel_128<<<gridDim, blockDim, 65536 * 2>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    default:
+        printf("Not implemented\n");
+    }
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_cuda.cu ADDED Viewed

	@@ -0,0 +1,871 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cmath>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+template <int K>
+__global__ void butterfly_padded_cuda_kernel_64(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset = blockIdx.y * H * M/2 + blockIdx.z * 16 *  M/2;
+    const int out_offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    const int N = 64;
+    extern __shared__ half x_shared[];
+    half *d_f_real = &x_shared[K * 16 * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 64 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __half2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[K][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[4];
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        t_offset = t * M/2;
+        out_t_offset = t * 64 * 32 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            if(i < K * 16){
+                idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+                shared_offset = i * 32 + threadIdx.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__half2 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset + t_offset], x_gate[idx + offset + t_offset]) : __floats2half2_rn(0.0f, 0.0f);
+                }
+                else{
+                    reinterpret_cast<__half2 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset + t_offset] : __floats2half2_rn(0.0f, 0.0f);
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+            for (int k = 0; k < K; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+            for (int k = 0; k < K; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+            }
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            out_real[out_offset + out_t_offset + idx] = reinterpret_cast<__half2 *>(out_real_shared)[shared_offset];
+            out_imag[out_offset + out_t_offset + idx] = reinterpret_cast<__half2 *>(out_imag_shared)[shared_offset];
+        }
+        __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_padded_cuda_kernel_128(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset = blockIdx.y * H * M/2 + blockIdx.z * 16 *  M/2;
+    const int out_offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x;
+    const int N = 128;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    extern __shared__ half shared_real[];
+    half *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[K][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[8];
+    for (int i = threadIdx.y ; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = i * 128 + threadIdx.x + j * blockDim.x;
+            shared_real[shared_offset] = d_f[shared_offset].real();
+            shared_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(shared_real)[shared_offset] = twiddle_factors_real[idx];
+            reinterpret_cast<__half2*>(shared_imag)[shared_offset] = twiddle_factors_imag[idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        t_offset = t * M/2;
+        out_t_offset = t * 128 * 32 * 2 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            if(i < K * 16){
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    if(x_gate != nullptr){
+                        reinterpret_cast<__half2*>(shared_real)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset + t_offset], x_gate[idx + offset + t_offset]) : __floats2half2_rn(0.0f, 0.0f);
+                    }
+                    else{
+                        reinterpret_cast<__half2*>(shared_real)[shared_offset] = idx < max_idx ? x[idx + offset + t_offset] : __floats2half2_rn(0.0f, 0.0f);
+                    }
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            __half2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+                }
+            }
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(shared_real + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+                wmma::store_matrix_sync(shared_imag + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = threadIdx.y; i < N; i+=blockDim.y)
+            {
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[idx + out_offset + out_t_offset] = reinterpret_cast<__half2*>(shared_real)[shared_offset];
+                    out_imag[idx + out_offset + out_t_offset] = reinterpret_cast<__half2*>(shared_imag)[shared_offset];
+                }
+            }
+            __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_padded_cuda_kernel_32(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 32;
+    __shared__ half x_shared[K * 16 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    __shared__ half out_imag_shared[32 * 64];
+    const int offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int out_offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x;
+    for(int i = threadIdx.y; i<32; i+=blockDim.y){
+        int idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        int shared_offset = i * 32 + threadIdx.x;
+        if(i < K * 16){
+            if(x_gate != nullptr){
+                reinterpret_cast<__half2*>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[offset  + idx], x_gate[offset  + idx]) : __floats2half2_rn(0.0f, 0.0f);
+            }
+            else{
+                reinterpret_cast<__half2*>(x_shared)[shared_offset] = idx < max_idx ? x[offset  + idx] : __floats2half2_rn(0.0f, 0.0f);
+            }
+        }
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[K][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                if(i<K){
+                    wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                }
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], __float2half(0.0f));
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]));
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+    // int idx = offset + threadIdx.y * 32 + blockIdx.x * 32 + threadIdx.x;
+    for(int i = threadIdx.y; i<32; i+=blockDim.y){
+        int idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        out_real[out_offset + idx] = reinterpret_cast<__half2*>(out_real_shared)[i * 32 + threadIdx.x];
+        out_imag[out_offset + idx] = reinterpret_cast<__half2*>(out_imag_shared)[i * 32 + threadIdx.x];
+    }
+}
+__global__ void butterfly_padded_cuda_kernel_16(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 16;
+    const int offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int out_offset = blockIdx.y * H * N * blockDim.x * gridDim.x + blockIdx.z * N * blockDim.x * gridDim.x;
+    __shared__ half x_shared[N * 64];
+    __shared__ half d_f_real[N * N];
+    __shared__ half d_f_imag[N * N];
+    __shared__ half twiddles_real_shared[N * 64];
+    __shared__ half twiddles_imag_shared[N * 64];
+    __shared__ half out_real_shared[N * 64];
+    __shared__ half out_imag_shared[N * 64];
+    // #pragma unroll
+  for(int i = threadIdx.y; i<N; i+=blockDim.y){
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        int shared_offset = i * blockDim.x + threadIdx.x;
+        if(x_gate != NULL){
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset], x_gate[idx + offset]) : __floats2half2_rn(0.0f, 0.0f);
+        }
+        else{
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset] : __floats2half2_rn(0.0f, 0.0f);
+        }
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){
+            shared_offset = i * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);
+        wmma::fill_fragment(acc_frag_imag, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)
+        {
+            tmp_real = reinterpret_cast<__half2 *>(acc_frag_real.x)[k];
+            tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<__half2 *>(acc_frag_real.x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]));
+            reinterpret_cast<__half2 *>(acc_frag_imag.x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]));
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i<N; i+=blockDim.y)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        out_real[out_offset + idx] = reinterpret_cast<__half2 *>(out_real_shared)[i * 32 + threadIdx.x];
+        out_imag[out_offset +  idx] = reinterpret_cast<__half2 *>(out_imag_shared)[i * 32 + threadIdx.x];
+    }
+}
+std::vector<torch::Tensor> butterfly_padded_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    std::optional<at::Tensor> x_gate = std::nullopt
+    )
+{
+    uint B = x.size(0);
+    uint H = x.size(1);
+    uint N = x.size(2);
+    uint d_f_size = d_f.size(1);
+    //need to make sure that N is less that the M to which we are padding
+    assert(N <= d_f_size * M);
+    // printf("B: %d, H: %d, N: %d\n", B, H, N);
+    dim3 gridDim;
+    dim3 blockDim;
+    gridDim.y = B;
+    gridDim.z = H;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    torch::Tensor out_real = torch::empty({B, H, d_f_size * M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, d_f_size * M}, x.options());
+    gridDim.x = 512 / (32 * 1024/ M);
+    const int K = ceil(N / (1.0 * 16 * M));
+    switch(d_f_size){
+        case 16:
+            butterfly_padded_cuda_kernel_16<<<gridDim, blockDim>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+        case 32:
+            switch (K)
+            {
+                case 1:
+                    butterfly_padded_cuda_kernel_32<1><<<gridDim, blockDim>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 2:
+                    butterfly_padded_cuda_kernel_32<2><<<gridDim, blockDim>>>(
+                            static_cast<__half2 *>(x.data_ptr()),
+                            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                            static_cast<complex_half_t *>(d_f.data_ptr()),
+                            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                            static_cast<__half2 *>(out_real.data_ptr()),
+                            static_cast<__half2 *>(out_imag.data_ptr()),
+                            B,
+                            H,
+                            N);
+                    break;
+                default:
+                    printf("Invalid K, df size 32: %d\n", K);
+            }
+            break;
+        case 64:
+            gridDim.z = H / 16;
+            switch (K)
+            {
+                case 1:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_64<1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_64<1><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                    N);
+                    break;
+                case 2:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_64<2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_64<2><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                    N);
+                    break;
+                case 3:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_64<3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_64<3><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                    N);
+                    break;
+                case 4:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_64<4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_64<4><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                    N);
+                    break;
+                default:
+                    printf("Invalid K, df size 64: %d\n", K);
+            }
+            break;
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            gridDim.x = 256 / (32 * 1024/ M);
+            gridDim.z = H / 16;
+            switch(K){
+                case 1:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<1><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 2:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<2><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 3:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<3><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 4:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<4><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 5:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<5>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<5><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 6:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<6>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<6><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 7:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<7>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<7><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 8:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<8>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<8><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                default:
+                    printf("Invalid K, df size 128: %d\n", K);
+            }
+            break;
+        default:
+            printf("Invalid d_f size: %d\n", d_f_size);
+    }
+    return {out_real, out_imag};
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_cuda_bf16.cu ADDED Viewed

	@@ -0,0 +1,897 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+template <int K>
+__global__ void butterfly_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset = blockIdx.y * H * M/2 + blockIdx.z * 16 *  M/2;
+    const int out_offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    const int N = 64;
+    extern __shared__ __nv_bfloat16 x_shared[];
+    __nv_bfloat16 *d_f_real_shared = &x_shared[K * 16 * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    float *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
+    }
+    float2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[4][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[4];
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        t_offset = t * M/2;
+        out_t_offset = t * 64 * 32 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            if(i < K * 16){
+                idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+                shared_offset = i * 32 + threadIdx.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset + t_offset], x_gate[idx + offset + t_offset]) : __floats2bfloat162_rn(0.0f, 0.0f);
+                }else{
+                    reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset + t_offset] : __floats2bfloat162_rn(0.0f, 0.0f);
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], 0.0f);
+            for (int k = 0; k < K; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+            for (int k = 0; k < K; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+            }
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            out_real[out_offset + out_t_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[shared_offset]);
+            out_imag[out_offset + out_t_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[shared_offset]);
+        }
+        __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int N  = 32;
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int out_offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x;
+    __shared__ __nv_bfloat16 x_shared[K * 16 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[32 * 32];
+    __shared__ __nv_bfloat16 d_f_imag_shared[32 * 32];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    __shared__ float out_imag_shared[32 * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i<32; i+=blockDim.y)
+    {
+        int idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        int shared_offset = i * 32 + threadIdx.x;
+        if(i < K * 16){
+            if(x_gate != nullptr){
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset], x_gate[idx + offset]) : __floats2bfloat162_rn(0.0f, 0.0f);
+            }else{
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset] : __floats2bfloat162_rn(0.0f, 0.0f);
+            }
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+        d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[K][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+                if(i < K){
+                    wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                }
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], 0.0f);
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                 for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {
+                    tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]);
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i<32; i+=blockDim.y)
+    {
+        int idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        out_real[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[i * 32 + threadIdx.x]);
+        out_imag[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[i * 32 + threadIdx.x]);
+    }
+}
+template <int K>
+__global__ void butterfly_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset = blockIdx.y * H * M/2 + blockIdx.z * 16 *  M/2;
+    const int out_offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x;
+    const int N = 128;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    extern __shared__ __nv_bfloat16 shared_real[];
+    __nv_bfloat16 *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[K][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[8];
+    for (int i = threadIdx.y ; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162 *>(shared_real)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162 *>(shared_imag)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = twiddle_factors_real[idx];
+            reinterpret_cast<__nv_bfloat162*>(shared_imag)[shared_offset] = twiddle_factors_imag[idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        t_offset = t * M/2;
+        out_t_offset = t * 128 * 32 * 2 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            if(i < K * 16){
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    if(x_gate != nullptr){
+                        reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = idx < max_idx ?  __hmul2(x[idx + offset + t_offset], x_gate[idx + offset + t_offset]) : __floats2bfloat162_rn(0.0f, 0.0f);
+                    }else{
+                        reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = idx < max_idx ? x[idx + offset + t_offset] : __floats2bfloat162_rn(0.0f, 0.0f);
+                    }
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], 0.0f);
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            float2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+                }
+            }
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = threadIdx.y; i < N; i+=blockDim.y)
+            {
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[idx + out_offset + out_t_offset] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+                }
+            }
+            __syncthreads();
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = threadIdx.y; i < N; i+=blockDim.y)
+            {
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    out_imag[idx + out_offset + out_t_offset] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+                }
+            }
+    }
+}
+template<int K>
+__global__ void butterfly_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 16;
+    const int offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int out_offset = blockIdx.y * H * N * blockDim.x * gridDim.x + blockIdx.z * N * blockDim.x * gridDim.x;
+    __shared__ __nv_bfloat16 x_shared[N * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[N * N];
+    __shared__ __nv_bfloat16 d_f_imag_shared[N * N];
+    __shared__ __nv_bfloat16 twiddles_real_shared[N * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[N * 64];
+    __shared__ float out_real_shared[N * 64];
+    __shared__ float out_imag_shared[N * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        int shared_offset = i * blockDim.x + threadIdx.x;
+        if(x_gate != nullptr){
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset], x_gate[idx + offset]) : __floats2bfloat162_rn(0.0f, 0.0f);
+        }else{
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset] : __floats2bfloat162_rn(0.0f, 0.0f);
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){
+            shared_offset = i * 16 + threadIdx.x;
+            d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+            d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real_shared, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag_shared, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);
+        wmma::fill_fragment(acc_frag_imag, 0.0f);
+         wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);
+#pragma unroll
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)
+        {
+            tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real.x)[k];
+            tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<float2 *>(acc_frag_real.x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]);
+            reinterpret_cast<float2 *>(acc_frag_imag.x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]);
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;;
+        out_real[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[i * 32 + threadIdx.x]);
+        out_imag[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[i * 32 + threadIdx.x]);
+    }
+}
+std::vector<torch::Tensor> butterfly_padded_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    std::optional<at::Tensor> x_gate = std::nullopt
+    )
+{
+    uint B = x.size(0);
+    uint H = x.size(1);
+    uint d_f_size = d_f_real.size(1);
+    uint N = x.size(2);
+    //need to make sure that N is less that the M to which we are padding
+    assert(N <= d_f_size * M);
+    dim3 gridDim;
+    dim3 blockDim;
+    gridDim.y = B;
+    gridDim.z = H;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    torch::Tensor out_real = torch::empty({B, H, d_f_size * M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, d_f_size * M}, x.options());
+    gridDim.x = 512 / (32 * 1024/ M);
+    const int K = ceil(N / (1.0 * 16 * M));
+    switch (d_f_size)
+    {
+        case 16:
+            butterfly_cuda_kernel_16<1><<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+        case 32:
+            switch(K){
+                case 1:
+                    butterfly_cuda_kernel_32<1><<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                case 2:
+                    butterfly_cuda_kernel_32<2><<<gridDim, blockDim>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                default:
+                    printf("Invalid K, df size 32: %d\n", K);
+            }
+            break;
+        case 64:
+            gridDim.z = H / 16;
+            switch(K){
+                case 1:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_64<1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+                    butterfly_cuda_kernel_64<1><<<gridDim, blockDim, 78000>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                case 2:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_64<2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+                    butterfly_cuda_kernel_64<2><<<gridDim, blockDim, 78000>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                case 3:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_64<3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+                    butterfly_cuda_kernel_64<3><<<gridDim, blockDim, 78000>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                case 4:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_64<4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+                    butterfly_cuda_kernel_64<4><<<gridDim, blockDim, 78000>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                default:
+                    printf("Invalid K, df size 64: %d\n", K);
+            }
+            break;
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            gridDim.x = 256 / (32 * 1024/ M);
+            gridDim.z = H / 16;
+            switch(K){
+                case 1:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<1><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                        break;
+                case 2:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<2><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 3:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<3><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 4:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<4><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 5:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<5>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<5><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 6:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<6>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<6><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 7:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<7>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<7><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 8:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<8>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<8><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                default:
+                    printf("Invalid K, df size 128: %d\n", K);
+            }
+            break;
+        default:
+        printf("Not yet implemented \n");
+            break;
+    }
+    return {out_real, out_imag};
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_ifft_cuda.cu ADDED Viewed

	@@ -0,0 +1,905 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+template <int TILE_H, int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_64(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int out_offset = blockIdx.y * H * M/2 + blockIdx.z * TILE_H * M/2;
+    const int in_offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * TILE_H * 64 * 32 * gridDim.x;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    const int N = 64;
+    extern __shared__ half x_real_shared[];
+    half *x_imag_shared = &x_real_shared[N * N];
+    half *d_f_real = &x_imag_shared[N * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[K][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[K][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[K];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 64 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        if(i < K){
+#pragma unroll
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+            }
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < TILE_H; t++)
+    {
+        out_t_offset = t * M/2;
+        t_offset = t * 64 * 32 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + in_offset + t_offset];
+            reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + in_offset + t_offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]);
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < K; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            if(idx < max_idx){
+                if(out_gate != nullptr)
+                    out_real[out_offset + out_t_offset + idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[shared_offset], out_gate[out_offset + out_t_offset + idx]);
+                else
+                    out_real[out_offset + out_t_offset + idx] = reinterpret_cast<__half2 *>(out_real_shared)[shared_offset];
+            }
+        }
+        __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_32(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 32;
+    int idx;
+    int shared_offset;
+    const int out_offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int in_offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x;
+    __shared__ half x_real_shared[32 * 64];
+    __shared__ half x_imag_shared[32 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        int shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[in_offset  + idx];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[in_offset  + idx];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 32 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N/16)
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[K][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[K][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[K][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                if(i < K){
+                    wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                    wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                }
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = __hneg(acc_frag_real[i][j].x[k]);
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        if(idx < max_idx){
+            if(out_gate != nullptr){
+                out_real[idx +  out_offset] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[shared_offset], out_gate[idx +  out_offset]);
+            }else{
+                out_real[idx +  out_offset] = reinterpret_cast<__half2 *>(out_real_shared)[shared_offset];
+            }
+        }
+    }
+}
+template <int TILE_H, int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_128(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int out_offset = blockIdx.y * H * M/2 + blockIdx.z * TILE_H *  M/2;
+    const int in_offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * TILE_H * 128 * 32 *  2 * gridDim.x;
+    const int N = 128;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    extern __shared__ half real_shared[];
+    half *imag_shared = &real_shared[128 * 128];
+    half *real_shared_2 = &imag_shared[128 * 128];
+    half *imag_shared_2 = &real_shared_2[128 * 128];
+    half tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag[K][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[K];
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = i * 128 + threadIdx.x + j * blockDim.x;
+            real_shared_2[shared_offset] = d_f[shared_offset].real();
+            imag_shared_2[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(real_shared)[shared_offset] = twiddle_factors_real[idx];
+            reinterpret_cast<__half2*>(imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int t = 0; t < TILE_H; t++)
+    {
+        out_t_offset = t * M/2;
+        t_offset = t * 128 * 32 * 2  * gridDim.x;
+        for (int i = 0; i < K; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            for(int j=0; j< 2; j++){
+                idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__half2*>(real_shared)[shared_offset] = x_real[idx + in_offset + t_offset];
+                reinterpret_cast<__half2*>(imag_shared)[shared_offset] = x_imag[idx + in_offset + t_offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]);
+            }
+        }
+        for (int i = 0; i < K; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < K; i++)
+        {
+            //wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            for(int j=0; j< 2; j++){
+                idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                if(idx < max_idx){
+                    if(out_gate != nullptr){
+                        out_real[idx + out_offset + out_t_offset] = __hmul2(reinterpret_cast<__half2*>(real_shared)[shared_offset], out_gate[idx + out_offset + out_t_offset]);
+                    }else{
+                        out_real[idx + out_offset + out_t_offset] = reinterpret_cast<__half2*>(real_shared)[shared_offset];
+                    }
+                }
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_padded_cuda_kernel_16(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 16;
+    const int out_offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int offset = blockIdx.y * H * N * blockDim.x * gridDim.x + blockIdx.z * N * blockDim.x * gridDim.x;
+    __shared__ half x_real_shared[N * 64];
+    __shared__ half x_imag_shared[N * 64];
+    __shared__ half d_f_real[N * N];
+    __shared__ half d_f_imag[N * N];
+    __shared__ half twiddles_real_shared[N * 64];
+    __shared__ half twiddles_imag_shared[N * 64];
+    __shared__ half out_real_shared[N * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        int shared_offset = i * blockDim.x + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        if(threadIdx.x  < 16 ){
+            shared_offset = i * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    //check if it is better to have one warp do all the multiplication or split between warps
+    if (threadIdx.y < 4)
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++)
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real);
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = __hneg(acc_frag_real.x[k]);
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        if(idx < max_idx){
+            if(out_gate != nullptr){
+                out_real[out_offset + idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[i * 32 + threadIdx.x], out_gate[out_offset + idx]);
+            }
+            else{
+                out_real[out_offset + idx] = reinterpret_cast<__half2 *>(out_real_shared)[i * 32 + threadIdx.x];
+            }
+        }
+    }
+}
+torch::Tensor butterfly_ifft_padded_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int fft_size,
+    std::optional<at::Tensor> out_gate = std::nullopt
+    )
+{
+    uint B = x_real.size(0);
+    uint H = x_real.size(1);
+    uint N_M = x_real.size(2);
+    const int d_f_size = d_f.size(0);
+    // const int TILE_SIZE = 16;
+    dim3 gridDim;
+    dim3 blockDim;
+    // uint N = x_real.size(2);
+    gridDim.y = B;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    gridDim.x = 512 / (32 * 1024/ (N_M / d_f_size));
+    gridDim.z = H;
+    const int TILE_H = 16;
+    torch::Tensor out_real = torch::empty({B, H, fft_size}, x_real.options());
+    const int K = ceil(fft_size / (1.0 * 16 * (N_M / d_f_size)));
+    switch(d_f_size){
+        case 16:
+            butterfly_ifft_padded_cuda_kernel_16<<<gridDim, blockDim>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+            break;
+        case 32:
+            switch (K)
+            {
+            case 1:
+                butterfly_ifft_padded_cuda_kernel_32<1><<<gridDim, blockDim>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+            case 2:
+                butterfly_ifft_padded_cuda_kernel_32<2><<<gridDim, blockDim>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+            default:
+                printf("Invalid K: %d\n", K);
+                break;
+            }
+            break;
+        case 64:
+            gridDim.z = H / TILE_H;
+            switch (K)
+            {
+            case 1:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 1><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 2:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 2><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 3:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 3><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 4:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 4><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            default:
+                break;
+            }
+            break;
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            gridDim.x = 256 / (32 * 1024/ (N_M / d_f_size));
+            gridDim.z = H / TILE_H;
+            switch (K)
+            {
+            case 1:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 1><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 2:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 2><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 3:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 3><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 4:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 4><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 5:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 5>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 5><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 6:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 6>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 6><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 7:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 7>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 7><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 8:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 8>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 8><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            default:
+                printf("Invalid K: %d\n", K);
+                break;
+            }
+            break;
+        default:
+            printf("Invalid d_f_size: %d\n", d_f_size);
+            break;
+    }
+    return out_real;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_ifft_cuda_bf16.cu ADDED Viewed

	@@ -0,0 +1,917 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+template <int TILE_H, int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int out_offset = blockIdx.y * H * M/2 + blockIdx.z * TILE_H * M/2;
+    const int in_offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * TILE_H * 64 * 32 * gridDim.x;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    const int N = 64;
+    extern __shared__ __nv_bfloat16 x_real_shared[];
+    __nv_bfloat16 *x_imag_shared = &x_real_shared[N * N];
+    __nv_bfloat16 *d_f_real_shared = &x_imag_shared[N * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[K][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[K][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[K];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] =  d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] =  d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        if(i < K){
+#pragma unroll
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+            }
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < TILE_H; t++)
+    {
+        out_t_offset = t * M/2;
+        t_offset = t * 64 * 32 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + in_offset + t_offset];
+            reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + in_offset + t_offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < K; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            if(idx < max_idx){
+                if(out_gate != nullptr)
+                    out_real[out_offset + out_t_offset + idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[shared_offset]), out_gate[out_offset + out_t_offset + idx]);
+                else
+                    out_real[out_offset + out_t_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[shared_offset]);
+            }
+        }
+        __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 32;
+    int idx;
+    int shared_offset;
+    const int out_offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int in_offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x;
+    __shared__ __nv_bfloat16 x_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[32 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[32 * 32];
+    __shared__ __nv_bfloat16 d_f_imag_shared[32 * 32];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        int shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[in_offset  + idx];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[in_offset  + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 32 + threadIdx.x;
+        d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+        d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    if (threadIdx.y < N/16)
+    {
+        __nv_bfloat16 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[K][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[K][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[K][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                if(i < K){
+                    wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+                    wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+                }
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = - acc_frag_real[i][j].x[k];
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        if(idx < max_idx){
+            if(out_gate != nullptr){
+                out_real[idx +  out_offset] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[shared_offset]), out_gate[idx +  out_offset]);
+            }else{
+                out_real[idx +  out_offset] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[shared_offset]);
+            }
+        }
+    }
+}
+template <int TILE_H, int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162  *__restrict__ d_f_real,
+    const __nv_bfloat162  *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int out_offset = blockIdx.y * H * M/2 + blockIdx.z * TILE_H *  M/2;
+    const int in_offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * TILE_H * 128 * 32 *  2 * gridDim.x;
+    const int N = 128;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    extern __shared__ __nv_bfloat16 real_shared[];
+    __nv_bfloat16 *imag_shared = &real_shared[128 * 128];
+    __nv_bfloat16 *real_shared_2 = &imag_shared[128 * 128];
+    __nv_bfloat16 *imag_shared_2 = &real_shared_2[128 * 128];
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag[K][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[K];
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared_2)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared_2)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = twiddle_factors_real[idx];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    for (int t = 0; t < TILE_H; t++)
+    {
+        out_t_offset = t * M/2;
+        t_offset = t * 128 * 32 * 2  * gridDim.x;
+        for (int i = 0; i < K; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            for(int j=0; j< 2; j++){
+                idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = x_real[idx + in_offset + t_offset];
+                reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = x_imag[idx + in_offset + t_offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        __syncthreads();
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = -acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < K; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < K; i++)
+        {
+            //wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(reinterpret_cast<float*>(real_shared) + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            for(int j=0; j< 2; j++){
+                idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                if(idx < max_idx){
+                    if(out_gate != nullptr){
+                        out_real[idx + out_offset + out_t_offset] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]), out_gate[idx + out_offset + out_t_offset]);
+                    }else{
+                        out_real[idx + out_offset + out_t_offset] = __float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]);
+                    }
+                }
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_padded_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 16;
+    const int out_offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int offset = blockIdx.y * H * N * blockDim.x * gridDim.x + blockIdx.z * N * blockDim.x * gridDim.x;
+    __shared__ __nv_bfloat16 x_real_shared[N * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[N * 64];
+    __shared__ __nv_bfloat16 twiddles_real_shared[N * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[N * 64];
+    __shared__ float out_real_shared[N * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        int shared_offset = i * blockDim.x + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        __nv_bfloat16 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++)
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real);
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = - acc_frag_real.x[k];
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+       int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        if(idx < max_idx){
+            if(out_gate != nullptr){
+                out_real[out_offset + idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[i * 32 + threadIdx.x]), out_gate[out_offset + idx]);
+            }else{
+                out_real[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[i * 32 + threadIdx.x]);
+            }
+        }
+    }
+}
+torch::Tensor butterfly_ifft_padded_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int fft_size,
+    std::optional<at::Tensor> out_gate = std::nullopt
+    )
+{
+    uint B = x_real.size(0);
+    uint H = x_real.size(1);
+    uint N_M = x_real.size(2);
+    const int d_f_size = d_f_real.size(0);
+    // const int TILE_SIZE = 16;
+    dim3 gridDim;
+    dim3 blockDim;
+    // uint N = x_real.size(2);
+    gridDim.y = B;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    gridDim.x = 512 / (32 * 1024/ (N_M / d_f_size));
+    gridDim.z = H;
+    const int TILE_H = 16;
+    torch::Tensor out_real = torch::empty({B, H, fft_size}, x_real.options());
+    const int K = ceil(fft_size / (1.0 * 16 * (N_M / d_f_size)));
+    switch(d_f_size){
+        case 16:
+            butterfly_ifft_padded_cuda_kernel_16<<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+        case 32:
+            switch (K)
+            {
+            case 1:
+                butterfly_ifft_padded_cuda_kernel_32<1><<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+            case 2:
+                butterfly_ifft_padded_cuda_kernel_32<2><<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+            default:
+                printf("Invalid K: %d\n", K);
+                break;
+            }
+            break;
+        case 64:
+            gridDim.z = H / TILE_H;
+            switch (K)
+            {
+            case 1:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 1><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 2:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 2><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 3:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 3><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 4:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 4><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            default:
+                break;
+            }
+            break;
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            gridDim.x = 256 / (32 * 1024/ (N_M / d_f_size));
+            gridDim.z = H / TILE_H;
+            switch (K)
+            {
+            case 1:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 1><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 2:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 2><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 3:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 3><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 4:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 4><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 5:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 5>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 5><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 6:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 6>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 6><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 7:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 7>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 7><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 8:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 8>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 8><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            default:
+                printf("Invalid K: %d\n", K);
+                break;
+            }
+            break;
+        default:
+            printf("Invalid d_f_size: %d\n", d_f_size);
+            break;
+    }
+    return out_real;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/shared.h ADDED Viewed

	@@ -0,0 +1,60 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+using namespace nvcuda;
+using complex_half_t = typename c10::complex<at::Half>;
+using complex_bhalf_t = typename c10::complex<at::BFloat16>;
+#define WMMA_M 16
+#define WMMA_N 16
+#define WMMA_K 16
+#define WARP_SIZE 32
+#ifndef MONARCH_CUDA_H_
+#define MONARCH_CUDA_H_
+__device__ __forceinline__ float2
+operator+( float2 lhs, float2 rhs)
+{
+    float2 res = { lhs.x + rhs.x , lhs.y + rhs.y };
+    return res;
+}
+__device__ __forceinline__ float2
+operator-( float2 lhs, float2 rhs)
+{
+    float2 res = { lhs.x - rhs.x , lhs.y - rhs.y };
+    return res;
+}
+__device__ __forceinline__ float2
+operator*( float2 lhs, float2 rhs)
+{
+    float2 res = { lhs.x * rhs.x , lhs.y * rhs.y };
+    return res;
+}
+#endif

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d.h ADDED Viewed

	@@ -0,0 +1,96 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_IS_HALF_OR_BFLOAT_OR_FLOAT(x) TORCH_CHECK(x.dtype() == torch::kFloat16 || x.dtype() == torch::kBFloat16 || x.dtype() == torch::kFloat32, #x " must be float16 or bfloat16 or float32")
+#define CHECK_SAME_TYPE(x, y) TORCH_CHECK(x.dtype() == y.dtype(), #x " and " #y " must have the same dtype")
+#define CHECK_INPUT(x) \
+    CHECK_CUDA(x);     \
+    CHECK_CONTIGUOUS(x); \
+    CHECK_IS_HALF_OR_BFLOAT_OR_FLOAT(x)
+torch::Tensor conv1d_cuda_bhl(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding);
+torch::Tensor conv1d_cuda_blh(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding);
+std::vector<torch::Tensor> conv1d_backward_bhl_cuda(
+    torch::Tensor dout,
+    torch::Tensor input,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding
+);
+std::vector<torch::Tensor> conv1d_backward_blh_cuda(
+    torch::Tensor dout,
+    torch::Tensor input,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding
+);
+torch::Tensor conv1d_fwd(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding,
+    bool is_bhl)
+{
+    CHECK_INPUT(u);
+    CHECK_INPUT(weight);
+    CHECK_INPUT(bias);
+    CHECK_SAME_TYPE(weight, bias);
+    int k;
+    if(is_bhl){
+        k = weight.size(1);
+    }else{
+        k = weight.size(0);
+    }
+    TORCH_CHECK(k % 2 == 1, "Filter size must be odd number");
+    if(is_bhl){
+        return conv1d_cuda_bhl(u, weight, bias, padding);
+    }else{
+        return conv1d_cuda_blh(u, weight, bias, padding);
+    }
+}
+std::vector<torch::Tensor> conv1d_bwd(
+    torch::Tensor dout,
+    torch::Tensor input,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding,
+    bool is_bhl)
+{
+    CHECK_INPUT(dout);
+    CHECK_INPUT(input);
+    CHECK_INPUT(weight);
+    CHECK_INPUT(bias);
+    CHECK_SAME_TYPE(weight, bias);
+    CHECK_SAME_TYPE(dout, input);
+    if(is_bhl){
+        return conv1d_backward_bhl_cuda(dout, input, weight, bias, padding);
+    } else{
+        return conv1d_backward_blh_cuda(dout, input, weight, bias, padding);
+    }
+}

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bhl.cu ADDED Viewed

	@@ -0,0 +1,132 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+// Simple 1D depthwise convolution implementation with dilation and stride = 1
+#include "shared.h"
+const uint BX = 256;
+const uint BY = 1;
+const uint BZ = 1;
+const uint TILE_SIZE_L = 4;
+const uint TILE_SIZE_D = 1;
+template<typename T, typename U>
+__forceinline__ __device__ T _conv1d_k_3(const T* u, const U* weights, const U* bias, uint padding, uint l, uint d, uint L, uint D, uint K)
+{
+    T tmp;
+    T weight;
+    set_value(&tmp, bias[d]);
+    int idx = l - padding;
+    if(idx >= 0 && idx < L){
+        set_value(&weight, weights[0]);
+        tmp = __hfma(u[d * L + idx], weight, tmp);
+    }
+    idx++;
+    if(idx >= 0 && idx < L){
+        set_value(&weight, weights[1]);
+        tmp = __hfma(u[d * L + idx], weight, tmp);
+    }
+    idx++;
+    if(idx >= 0 && idx < L){
+        set_value(&weight, weights[2]);
+        tmp = __hfma(u[d * L + idx], weight, tmp);
+    }
+    return tmp;
+}
+template<typename T, typename U>
+__global__ void conv1d_kernel(
+    const T *__restrict__ u,
+    const U *__restrict__ weights,
+    const U *__restrict__ bias,
+    T *__restrict__ out,
+    uint padding,
+    uint B,
+    uint L,
+    uint D,
+    uint K,
+    uint L_out
+    )
+{
+    const int b = blockIdx.z * blockDim.z + threadIdx.z;
+    const int d = blockIdx.y * blockDim.y * TILE_SIZE_D + threadIdx.y;
+    const int l_offset = blockIdx.x * blockDim.x * TILE_SIZE_L + threadIdx.x;
+    T tmp;
+    T weight;
+    int idx;
+    int l;
+    for(int l_tile = 0; l_tile < TILE_SIZE_L; l_tile++){
+        l = l_offset + l_tile * blockDim.x;
+        set_value(&tmp, bias[d]);
+        if(d < D && l < L_out && b < B){
+            if(K == 3){
+                out[b * L_out * D + d * L_out + l] = _conv1d_k_3(u + b * L * D, weights + d * K, bias, padding, l, d, L, D, K);
+            } else{
+                for(int k = 0; k < K; k++){
+                    idx = l - padding + k;
+                    if(idx >= 0 && idx < L){
+                        set_value(&weight, weights[d * K + k]);
+                        tmp = __hfma(u[b * L_out * D + d * L + idx], weight, tmp);
+                    }
+                }
+                out[b * L_out * D + d * L_out + l] = tmp;
+            }
+        }
+    }
+}
+torch::Tensor conv1d_cuda_bhl(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding)
+{
+    const uint b = u.size(0);
+    const uint d = u.size(1);
+    const uint l = u.size(2);
+    const uint k = weight.size(1);
+    uint l_out = (l + 2 * padding - k + 1);
+    dim3 blockDims(BX, BY, BZ);
+    dim3 gridDims(ceil(l_out * 1.0 / (BX * TILE_SIZE_L) ), ceil((d * 1.0) / (BY * TILE_SIZE_D)), ceil((b * 1.0) / BZ));
+    torch::Tensor out = torch::empty({b, d, l_out}, u.options());
+    DISPATCH_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d fwd bhl",
+        ([&]
+            { conv1d_kernel<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    static_cast<weight_t *>(bias.data_ptr()),
+                    static_cast<input_t *>(out.data_ptr()),
+                    padding,
+                    b,
+                    l,
+                    d,
+                    k,
+                    l_out
+                    );
+            }
+        )
+    );
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_blh.cu ADDED Viewed

	@@ -0,0 +1,202 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+// Simple 1D depthwise convolution implementation with dilation and stride = 1
+#include "shared.h"
+//For max perf, tune for your GPU and batch size, and datatype etc
+const uint BX = 512;
+const uint BY = 1;
+const uint BZ = 1;
+const uint TILE_SIZE_Y = 4;
+const uint TILE_SIZE_X = 2;
+// Trick to do padding in place without actually creating a new tensor
+__forceinline__ __device__ __half2 get_u(const __half2 *__restrict__ u, uint L_eff, uint l, uint p, uint b, uint k, uint d, uint L, uint D, uint K)
+{
+    return l + k < p || l + k > L_eff - (p + 1) ? __float2half2_rn(0.0f) : u[b * L * D + (l + k - p) * D + d];
+}
+__forceinline__ __device__ __nv_bfloat162 get_u(const __nv_bfloat162 *__restrict__ u, uint L_eff, uint l, uint p, uint b, uint k, uint d, uint L, uint D, uint K)
+{
+    return l + k < p || l + k > L_eff - (p + 1) ? __float2bfloat162_rn(0.0f) : u[b * L * D + (l + k - p) * D + d];
+}
+__forceinline__ __device__ float2 get_u(const float2 *__restrict__ u, uint L_eff, uint l, uint p, uint b, uint k, uint d, uint L, uint D, uint K)
+{
+    return l + k < p || l + k > L_eff - (p + 1) ? make_float2(0.0f, 0.0f) : u[b * L * D + (l + k - p) * D + d];
+}
+//manually unrolling loop for k = 3 leads to good perf, can easily extend for other values of k if need be
+template<typename T, typename U>
+__forceinline__ __device__ T _conv1d_k_3(const T* u, const U* weights, const U* bias, T* out, uint padding, uint b, uint l, uint d, uint t, uint L, uint D, uint K, uint L_eff, uint L_out)
+{
+    T tmp;
+    T weight;
+    set_value(&tmp, bias[d]);
+    set_value(&weight, weights[0 * D + d]);
+    tmp = __hfma2(get_u(u, L_eff, l + t, padding, b, 0, d, L, D, K), weight, tmp);
+    set_value(&weight, weights[1 * D + d]);
+    tmp = __hfma2(get_u(u, L_eff, l + t, padding, b, 1, d, L, D, K), weight, tmp);
+    set_value(&weight, weights[2 * D + d]);
+    out[b * D * L_out  + (l + t) * D + d] = __hfma2(get_u(u, L_eff, l + t, padding, b, 2, d, L, D, K), weight, tmp);
+}
+template<typename T, typename U>
+__global__ void conv1d_kernel_k_3(
+    const T *__restrict__ u,
+    const U *__restrict__ weights,
+    const U *__restrict__ bias,
+    T *__restrict__ out,
+    uint padding,
+    uint B,
+    uint L,
+    uint L_out,
+    uint L_eff,
+    uint D,
+    uint K)
+{
+    const int d_block = blockIdx.x * blockDim.x * TILE_SIZE_X;
+    const int l = blockIdx.y * blockDim.y * TILE_SIZE_Y + threadIdx.y * TILE_SIZE_Y;
+    const int b = blockIdx.z * blockDim.z + threadIdx.z;
+    int d;
+    #pragma unroll
+    for (int i = 0; i < TILE_SIZE_X; i++)
+    {
+        d = d_block + threadIdx.x + i * BX;
+        if (d < D && b < B){
+            #pragma unroll
+            for (int t = 0; t < TILE_SIZE_Y; t++){
+                if (l + t < L_eff - K + 1)
+                {
+                    _conv1d_k_3(u, weights, bias, out, padding, b, l, d, t, L, D, K, L_eff, L_out);
+                }
+            }
+        }
+    }
+}
+template<typename T, typename U>
+__global__ void conv1d_kernel(
+    const T *__restrict__ u,
+    const U *__restrict__ weights,
+    const U *__restrict__ bias,
+    T *__restrict__ out,
+    uint padding,
+    uint B,
+    uint L,
+    uint L_out,
+    uint L_eff,
+    uint D,
+    uint K)
+{
+    const int d_block = blockIdx.x * blockDim.x * TILE_SIZE_X;
+    const int l = blockIdx.y * blockDim.y * TILE_SIZE_Y + threadIdx.y * TILE_SIZE_Y;
+    const int b = blockIdx.z * blockDim.z + threadIdx.z;
+    int d;
+    T tmp;
+    T weight;
+    #pragma unroll
+        for (int i = 0; i < TILE_SIZE_X; i++)
+        {
+            d = d_block + threadIdx.x + i * BX;
+            if (d < D && b < B){
+                #pragma unroll
+                for (int t = 0; t < TILE_SIZE_Y; t++){
+                    if (l + t < L_eff - K + 1)
+                    {
+                        set_value(&tmp, bias[d]);
+                        for(int k = 0; k < K; k++){
+                            set_value(&weight, weights[k * D + d]);
+                            tmp = __hfma2(get_u(u, L_eff, l + t, padding, b, k, d, L, D, K), weight, tmp);
+                        }
+                            out[b * D * L_out  + (l + t) * D + d] = tmp;
+                    }
+                }
+            }
+        }
+}
+torch::Tensor conv1d_cuda_blh(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding)
+{
+    const uint b = u.size(0);
+    const uint l = u.size(1);
+    const uint d = u.size(2);
+    const uint k = weight.size(0);
+    uint l_eff = l + 2 * padding;
+    dim3 blockDims(BX, BY, BZ);
+    dim3 gridDims(ceil(d * 1.0 / (BX * TILE_SIZE_X * 2) ), ceil((l_eff - k + 1) * 1.0 / (BY * TILE_SIZE_Y)), ceil(b * 1.0 / BZ));
+    uint l_out = (l + 2 * padding - k + 1);
+    torch::Tensor out = torch::empty({b, l_out, d}, u.options());
+    //calling seperate kernels for k=3 and k!=3 leads to better perf
+    if(k==3){
+         DISPATCH_FLOAT2_AND_HALF2_AND_BF162(u.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d fwd blh",
+        ([&]
+            { conv1d_kernel_k_3<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    static_cast<weight_t *>(bias.data_ptr()),
+                    static_cast<input_t *>(out.data_ptr()),
+                    padding,
+                    b,
+                    l,
+                    l_out,
+                    l_eff,
+                    ceil(d/2),
+                    k);
+            }
+        )
+    );
+    }else{
+       DISPATCH_FLOAT2_AND_HALF2_AND_BF162(u.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d fwd blh",
+        ([&]
+            { conv1d_kernel<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    static_cast<weight_t *>(bias.data_ptr()),
+                    static_cast<input_t *>(out.data_ptr()),
+                    padding,
+                    b,
+                    l,
+                    l_out,
+                    l_eff,
+                    ceil(d/2),
+                    k);
+            }
+        )
+    );
+    }
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bwd_cuda_bhl.cu ADDED Viewed

	@@ -0,0 +1,106 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include "shared.h"
+const uint BX = 128;
+const uint BY = 1;
+const uint BZ = 1;
+const uint TILE_SIZE = 4;
+template <typename input_t, typename weight_t>
+__global__ void conv1d_backward_kernel(
+    const input_t* __restrict__ dout,
+    const input_t* __restrict__ u,
+    const weight_t* __restrict__ weights,
+    input_t* __restrict__ du,
+    input_t* __restrict__ dk,
+    uint B,
+    uint L,
+    uint D,
+    uint K,
+    uint P
+    )
+{
+    const int b = blockIdx.z;
+    const int d = blockIdx.y;
+    const int l = blockIdx.x;
+    //construct the du matrix
+    if(b < B && d < D && l == 0){
+        for(int j = threadIdx.x; j < L; j += blockDim.x)
+        {
+            input_t sum;
+            set_value(&sum, 0.0f);
+            input_t weight;
+            for(int k = 0; k < K ; k++)
+            {
+                int idx = - P + k + j;
+                if(idx >= 0 && idx < L){
+                    set_value(&weight, weights[d * K + K - (k +1)]);
+                    sum = __hfma(dout[b * D * L + d * L + idx], weight, sum);
+                }
+            }
+            du[b * D * L + d * L + j] = sum;
+        }
+    }
+    const int k = blockIdx.x;
+    input_t tmp;
+    //construct the dk matrix
+    if(b < B && d < D && k < K)
+    {
+        for(int j = threadIdx.x; j < L; j += blockDim.x)
+        {
+            if(k - P + j < 0 || k - P + j >= L){
+                set_value(&dk[b * D * K * L + d * K * L + k * L + j], 0.0f);
+            }else{
+                set_value(&dk[b * D * K * L + d * K * L + k * L + j], u[b * D * L + d * L + k - P + j]);
+            }
+        }
+    }
+}
+std::vector<torch::Tensor> conv1d_backward_bhl_cuda(
+    torch::Tensor dout,
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding)
+{
+    const uint b = u.size(0);
+    const uint d = u.size(1);
+    const uint l = u.size(2);
+    const uint k = weight.squeeze().size(1);
+    dim3 blockDims(BX, 1, 1);
+    dim3 gridDims(l, d, b);
+    torch::Tensor du = torch::empty({b, d, l}, u.options());
+    torch::Tensor dk = torch::empty({b, d, k, l}, dout.options());
+    torch::Tensor dbias = dout.sum(-1).sum(0);
+    DISPATCH_FLOAT_AND_HALF_AND_BF16(dout.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d backward bhl",
+        ([&]
+            { conv1d_backward_kernel<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(dout.data_ptr()),
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    static_cast<input_t *>(du.data_ptr()),
+                    static_cast<input_t *>(dk.data_ptr()),
+                    b,
+                    l,
+                    d,
+                    k,
+                    padding);
+            }
+        )
+    );
+    return {du, torch::matmul(dk, dout.unsqueeze(-1)).squeeze(-1).sum(0).to(weight.type()), dbias};
+}

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bwd_cuda_blh.cu ADDED Viewed

	@@ -0,0 +1,116 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include "shared.h"
+const uint BX = 128;
+const uint BY = 1;
+const uint BZ = 1;
+template <typename input_t, typename weight_t>
+__global__ void conv1d_backward_kernel(
+    const input_t* __restrict__ dout,
+    int dout_stride0,
+    int dout_stride1,
+    int dout_stride2,
+    const input_t* __restrict__ u,
+    const weight_t* __restrict__ weights,
+    int weights_stride0,
+    int weights_stride1,
+    input_t* __restrict__ du,
+    input_t* __restrict__ dk,
+    uint B,
+    uint L,
+    uint D,
+    uint K,
+    uint P
+    )
+{
+    const int b = blockIdx.z;
+    const int d = blockIdx.y;
+    const int l = blockIdx.x;
+    //construct the du matrix
+    if(b < B && d < D && l == 0){
+        for(int j = threadIdx.x; j < L; j += blockDim.x)
+        {
+            input_t sum;
+            set_value(&sum, 0.0f);
+            input_t weight;
+            for(int k = 0; k < K ; k++)
+            {
+                int idx = - P + k + j;
+                if(idx >= 0 && idx < L){
+                    set_value(&weight, weights[d * weights_stride1 + (K - (k +1)) * weights_stride0]);
+                    sum = __hfma(dout[b * dout_stride0 + d * dout_stride1 + idx * dout_stride2], weight, sum);
+                }
+            }
+            du[b * D * L + j * D + d] = sum;
+        }
+    }
+    const int k = blockIdx.x;
+    //construct the dk matrix
+    if(b < B && d < D && k < K)
+    {
+        for(int j = threadIdx.x; j < L; j += blockDim.x)
+        {
+            if(k - P + j < 0 || k - P + j >= L){
+                set_value(&dk[b * D * K * L + d * K * L + k * L + j], 0.0f);
+            }else{
+                set_value(&dk[b * D * K * L + d * K * L + k * L + j], u[b * D * L + (k - P + j) * D + d]);
+            }
+        }
+    }
+}
+std::vector<torch::Tensor> conv1d_backward_blh_cuda(
+    torch::Tensor dout,
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding)
+{
+    const uint b = u.size(0);
+    const uint l = u.size(1);
+    const uint d = u.size(2);
+    const uint k = weight.squeeze().size(0);
+    dim3 blockDims(BX, 1, 1);
+    dim3 gridDims(l, d, b);
+    torch::Tensor du = torch::empty({b, l, d}, u.options());
+    torch::Tensor dk = torch::empty({b, d, k, l}, u.options());
+    torch::Tensor dbias = dout.sum(-2).sum(0);
+    dout = dout.transpose(-1,-2);
+    DISPATCH_FLOAT_AND_HALF_AND_BF16(dout.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d backward blh",
+        ([&]
+            { conv1d_backward_kernel<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(dout.data_ptr()),
+                    dout.stride(0),
+                    dout.stride(1),
+                    dout.stride(2),
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    weight.stride(0),
+                    weight.stride(1),
+                    static_cast<input_t *>(du.data_ptr()),
+                    static_cast<input_t *>(dk.data_ptr()),
+                    b,
+                    l,
+                    d,
+                    k,
+                    padding);
+            }
+        )
+    );
+    return {du, torch::matmul(dk, dout.unsqueeze(-1)).squeeze(-1).sum(0).view({k, d}).to(weight.dtype()), dbias};
+}

overlay/kernels/cuda/flashfftconv/csrc/conv1d/shared.h ADDED Viewed

	@@ -0,0 +1,168 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <vector>
+#define DISPATCH_FLOAT_AND_HALF_AND_BF16(INPUT_TYPE, WEIGHT_TYPE, NAME, ...)                     \
+  if ((INPUT_TYPE == at::ScalarType::Half) && (WEIGHT_TYPE == at::ScalarType::Half)) {           \
+    using input_t = __half;                                                            \
+    using weight_t = __half;                                                           \
+    __VA_ARGS__();                                                                       \
+  } else if((INPUT_TYPE == at::ScalarType::Half) && (WEIGHT_TYPE == at::ScalarType::BFloat16)){    \
+    using input_t = __half;                                                            \
+    using weight_t = __nv_bfloat16;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if((INPUT_TYPE == at::ScalarType::Half) && (WEIGHT_TYPE == at::ScalarType::Float)){   \
+    using input_t = __half;                                                            \
+    using weight_t = float;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::BFloat16) && (WEIGHT_TYPE == at::ScalarType::BFloat16)) {    \
+    using input_t = __nv_bfloat16;                                                        \
+    using weight_t = __nv_bfloat16;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::BFloat16) && (WEIGHT_TYPE == at::ScalarType::Half)) {      \
+    using input_t = __nv_bfloat16;                                                        \
+    using weight_t = __half;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::BFloat16) && (WEIGHT_TYPE == at::ScalarType::Float)) {    \
+    using input_t = __nv_bfloat16;                                                        \
+    using weight_t = float;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::Float) && (WEIGHT_TYPE == at::ScalarType::Float))  { \
+    using input_t = float;                                                               \
+    using weight_t = float;                                                              \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::Float) && (WEIGHT_TYPE == at::ScalarType::Half))  {  \
+    using input_t = float;                                                               \
+    using weight_t = __half;                                                           \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::Float) && (WEIGHT_TYPE == at::ScalarType::BFloat16))  {  \
+    using input_t = float;                                                               \
+    using weight_t = __nv_bfloat16;                                                           \
+    __VA_ARGS__();                                                                       \
+  } else {                                                                               \
+    AT_ERROR(#NAME, " not implemented for input-type '", toString(INPUT_TYPE), "' and weight-type '", toString(WEIGHT_TYPE), "'"); \
+  }
+#define DISPATCH_FLOAT2_AND_HALF2_AND_BF162(INPUT_TYPE, WEIGHT_TYPE, NAME, ...)                     \
+  if ((INPUT_TYPE == at::ScalarType::Half) && (WEIGHT_TYPE == at::ScalarType::Half)) {           \
+    using input_t = __half2;                                                            \
+    using weight_t = __half2;                                                           \
+    __VA_ARGS__();                                                                       \
+  } else if((INPUT_TYPE == at::ScalarType::Half) && (WEIGHT_TYPE == at::ScalarType::BFloat16)){    \
+    using input_t = __half2;                                                            \
+    using weight_t = __nv_bfloat162;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if((INPUT_TYPE == at::ScalarType::Half) && (WEIGHT_TYPE == at::ScalarType::Float)){   \
+    using input_t = __half2;                                                            \
+    using weight_t = float2;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::BFloat16) && (WEIGHT_TYPE == at::ScalarType::BFloat16)) {    \
+    using input_t = __nv_bfloat162;                                                        \
+    using weight_t = __nv_bfloat162;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::BFloat16) && (WEIGHT_TYPE == at::ScalarType::Half)) {      \
+    using input_t = __nv_bfloat162;                                                        \
+    using weight_t = __half2;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::BFloat16) && (WEIGHT_TYPE == at::ScalarType::Float)) {    \
+    using input_t = __nv_bfloat162;                                                        \
+    using weight_t = float2;                                                       \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::Float) && (WEIGHT_TYPE == at::ScalarType::Float))  { \
+    using input_t = float2;                                                               \
+    using weight_t = float2;                                                              \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::Float) && (WEIGHT_TYPE == at::ScalarType::Half))  {  \
+    using input_t = float2;                                                               \
+    using weight_t = __half2;                                                           \
+    __VA_ARGS__();                                                                       \
+  } else if ((INPUT_TYPE == at::ScalarType::Float) && (WEIGHT_TYPE == at::ScalarType::BFloat16))  {  \
+    using input_t = float2;                                                               \
+    using weight_t = __nv_bfloat162;                                                           \
+    __VA_ARGS__();                                                                       \
+  } else {                                                                               \
+    AT_ERROR(#NAME, " not implemented for input-type '", toString(INPUT_TYPE), "' and weight-type '", toString(WEIGHT_TYPE), "'"); \
+  }
+__forceinline__ __device__ float __hfma(const float a, const float b, const float c)
+{
+    return a * b + c;
+}
+__forceinline__ __device__ float2 __hfma2(const float2 a, const float2 b, const float2 c)
+{
+    return make_float2(a.x * b.x + c.x, a.y * b.y + c.y);
+}
+template<typename T>
+__forceinline__ __device__ void set_value(T* dst, T src)
+{
+    *dst = src;
+}
+__forceinline__ __device__ void set_value(__half2* dst, float2 src)
+{
+    *dst = __float22half2_rn(src);
+}
+__forceinline__ __device__ void set_value(__nv_bfloat162* dst, float2 src)
+{
+    *dst = __float22bfloat162_rn(src);
+}
+__forceinline__ __device__ void set_value(float2* dst, __half2 src)
+{
+    *dst = __half22float2(src);
+}
+__forceinline__ __device__ void set_value(float2* dst, __nv_bfloat162 src)
+{
+    *dst = __bfloat1622float2(src);
+}
+__forceinline__ __device__ void set_value(__half2* dst, __nv_bfloat162 src)
+{
+    *dst = __float22half2_rn(__bfloat1622float2(src));
+}
+__forceinline__ __device__ void set_value(__nv_bfloat162* dst, __half2 src)
+{
+    *dst = __float22bfloat162_rn(__half22float2(src));
+}
+__forceinline__ __device__ void set_value(__half* dst, float src)
+{
+    *dst = __float2half(src);
+}
+__forceinline__ __device__ void set_value(__nv_bfloat16* dst, float src)
+{
+    *dst = __float2bfloat16(src);
+}
+__forceinline__ __device__ void set_value(float* dst, __half src)
+{
+    *dst = __half2float(src);
+}
+__forceinline__ __device__ void set_value(float* dst, __nv_bfloat16 src)
+{
+    *dst = __bfloat162float(src);
+}
+__forceinline__ __device__ void set_value(__half* dst, __nv_bfloat16 src)
+{
+    *dst = __float2half(__bfloat162float(src));
+}
+__forceinline__ __device__ void set_value(__nv_bfloat16* dst, __half src)
+{
+    *dst = __float2bfloat16(__half2float(src));
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch.cpp ADDED Viewed

	@@ -0,0 +1,61 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include "monarch_cuda/monarch_fwd.h"
+#include "monarch_cuda/monarch_fwd_complex.h"
+#include "monarch_cuda/monarch_fwd_r2r.h"
+#include "monarch_cuda/monarch_bwd.h"
+#include "monarch_cuda/monarch_bwd_complex.h"
+#include "monarch_cuda/monarch_bwd_r2r.h"
+#include "butterfly/butterfly.h"
+#include "conv1d/conv1d.h"
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("monarch_conv_forward", &monarch_conv, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_16_16_16", &monarch_conv_16_16_16, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_32_16_16", &monarch_conv_32_16_16, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_16_32_32", &monarch_conv_16_32_32, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_32_32_32", &monarch_conv_32_32_32, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_16_16_16_complex", &monarch_conv_16_16_16_complex, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_32_16_16_complex", &monarch_conv_32_16_16_complex, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_16_32_32_complex", &monarch_conv_16_32_32_complex, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_32_32_32_complex", &monarch_conv_32_32_32_complex, "Monarch forward (CUDA)");
+    m.def("monarch_conv_forward_32_32_32_complex_truncated", &monarch_conv_32_32_32_complex_truncated, "Monarch forward (CUDA)");
+    m.def("monarch_conv_backward", &monarch_conv_bwd, "Monarch backward (CUDA)");
+    m.def("monarch_conv_backward_16_16_16", &monarch_conv_bwd_16_16_16, "Monarch backward (CUDA)");
+    m.def("monarch_conv_backward_32_16_16", &monarch_conv_bwd_32_16_16, "Monarch backward (CUDA)");
+    m.def("monarch_conv_backward_16_32_32", &monarch_conv_bwd_16_32_32, "Monarch backward (CUDA)");
+    m.def("monarch_conv_backward_32_32_32", &monarch_conv_bwd_32_32_32, "Monarch backward (CUDA)");
+    m.def("monarch_conv_backward_16_16_16_complex", &monarch_conv_bwd_16_16_16_complex, "Monarch backward (CUDA)");
+    m.def("monarch_conv_backward_32_16_16_complex", &monarch_conv_bwd_32_16_16_complex, "Monarch backward (CUDA)");
+    m.def("monarch_conv_backward_16_32_32_complex", &monarch_conv_bwd_16_32_32_complex, "Monarch backward (CUDA)");
+    m.def("monarch_conv_backward_32_32_32_complex", &monarch_conv_bwd_32_32_32_complex, "Monarch backward (CUDA)");
+    m.def("monarch_conv_forward_r2r", &monarch_conv_r2r, "Monarch forward (CUDA)");
+    m.def("monarch_conv_backward_r2r", &monarch_conv_bwd_r2r, "Monarch backward (CUDA)");
+    // butterfly kernels
+    m.def("butterfly_forward", &butterfly, "Butterfly forward (CUDA)");
+    m.def("butterfly_gated_forward", &butterfly_gated, "Butterfly gated forward (CUDA)");
+    m.def("butterfly_bf16_forward", &butterfly_bf16, "Butterfly forward bf16 (CUDA)");
+    m.def("butterfly_gated_bf16_forward", &butterfly_gated_bf16, "Butterfly gated forward bf16 (CUDA)");
+    m.def("butterfly_padded_forward", &butterfly_padded, "Butterfly padded (CUDA)");
+    m.def("butterfly_padded_bf16_forward", &butterfly_padded_bf16, "Butterfly padded (CUDA)");
+    m.def("butterfly_padded_gated_forward", &butterfly_padded_gated, "Butterfly padded (CUDA)");
+    m.def("butterfly_padded_gated_bf16_forward", &butterfly_padded_gated_bf16, "Butterfly padded (CUDA)");
+    m.def("butterfly_ifft_forward", &butterfly_ifft, "Butterfly ifft forard (CUDA)");
+    m.def("butterfly_ifft_gated_forward", &butterfly_ifft_gated, "Butterfly ifft gated forard (CUDA)");
+    m.def("butterfly_ifft_gated_bf16_forward", &butterfly_ifft_gated_bf16, "Butterfly ifft gated bf16 forard (CUDA)");
+    m.def("butterfly_ifft_bf16_forward", &butterfly_ifft_bf16, "Butterfly ifft forward bf16 (CUDA)");
+    m.def("butterfly_ifft_padded_forward", &butterfly_ifft_padded, "Butterfly ifft forward padded (CUDA)");
+    m.def("butterfly_ifft_padded_gated_forward", &butterfly_ifft_padded_gated, "Butterfly ifft forward padded (CUDA)");
+    m.def("butterfly_ifft_padded_bf16_forward", &butterfly_ifft_padded_bf16, "Butterfly ifft forward padded (CUDA)");
+    m.def("butterfly_ifft_padded_gated_bf16_forward", &butterfly_ifft_padded_gated_bf16, "Butterfly ifft forward padded (CUDA)");
+    m.def("conv1d_forward", &conv1d_fwd, "conv1d forward (CUDA)");
+    m.def("conv1d_backward", &conv1d_bwd, "conv1d backward (CUDA)");
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_16_16_bwd_complex_kernel_bf16.h ADDED Viewed

	@@ -0,0 +1,672 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include "monarch_cuda_shared_bf16_no_float_shm.h"
+using namespace nvcuda;
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int N, int MATMUL_WARP_WIDTH, int DFT_SIZE, bool RECOMPUTE, int B_TILE_SIZE, int H_TILE_SIZE, int WARP_TILE_SIZE>
+__global__ void monarch_conv_bwd_cuda_complex_kernel(
+    const at::BFloat16 *__restrict__ dout_real_inp,
+    const at::BFloat16 *__restrict__ dout_imag_inp,
+    const at::BFloat16 *__restrict__ a_real_inp,
+    const at::BFloat16 *__restrict__ a_imag_inp,
+    const c10::complex<at::BFloat16> *__restrict__ k_f,
+    const c10::complex<at::BFloat16> *__restrict__ b,                        // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_256_fft,  // 4096
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_16_fft,   // 256
+    const c10::complex<at::BFloat16> *__restrict__ b_ifft,                   // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_256_ifft, // 4096
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_16_ifft,  // 256
+    at::BFloat16 *dx_out_real,
+    at::BFloat16 *dx_out_imag,
+    c10::complex<at::BFloat16> *dk_f_out,
+    uint B,
+    uint H,
+    uint signal_size,
+    uint sqrt_N)
+{
+  extern __shared__ at::Half a_real_fp16[];
+  at::BFloat16 *a_real = reinterpret_cast<at::BFloat16 *>(&a_real_fp16[0]);
+  at::BFloat16 *a_imag = &a_real[N];
+  at::BFloat16 *a_real_2 = &a_real[2 * N];
+  at::BFloat16 *a_imag_2 = &a_real[3 * N];
+  at::BFloat16 *b_real = &a_real[4 * N];
+  at::BFloat16 *b_imag = &a_real[4 * N + 256];
+  at::BFloat16 *b_real_2 = &a_real[4 * N + 2 * 256];
+  at::BFloat16 *b_imag_2 = &a_real[4 * N + 3 * 256];
+  const int num_threads = BLOCK_DIM_X * BLOCK_DIM_Y;
+  const int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+  // const int thread_id = threadIdx.x;
+  const int items_per_thread_input = N / num_threads;
+  // this is for reading in the DFT matrix or twiddle factors
+  const int items_per_thread_matrix = num_threads <= 128 ? DFT_SIZE * DFT_SIZE / num_threads : 2;
+  const int warp_id = thread_id / WARP_SIZE;
+  // NOTE - we are loading and storing data in a STRIPED FORMAT
+  // SEQUENCE_SIZE * TILE_SIZE items, WARP_SIZE * TILE_SIZE threads -> items_per_thread_input
+  using BlockLoad_Input = cub::BlockLoad<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Sequence = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Matrix = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT / Twiddle, etc
+  using BlockStore_Sequence = cub::BlockStore<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  using BlockStore_Sequence_Complex = cub::BlockStore<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  // index into block blockIdx.x
+  int b_offset = blockIdx.x * H * signal_size * B_TILE_SIZE;
+  // index into the H
+  int h_offset_signal = blockIdx.y * signal_size * H_TILE_SIZE;
+  int h_offset_kernel = blockIdx.y * N * H_TILE_SIZE;
+  complex_bfloat16_t a_input_data[items_per_thread_input];    // for storing the input, also used for k_f
+  at::BFloat16 x_input_data[items_per_thread_input];     // for storing the input
+  complex_bfloat16_t temp[items_per_thread_input];
+  complex_bfloat16_t b_input_data[items_per_thread_matrix];   // for storing matrices, twiddle factors
+  complex_bfloat16_t b_input_data_2[items_per_thread_matrix]; // another place for storing matrices, twiddle factors
+  // for the dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for the idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for the dft
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> a_frag_dft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_16_dft_frag[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_16_idft_frag[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for 256 twiddle
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_256_dft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for 256 idft twiddle
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> twiddle_256_idft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // // for twiddles
+  // wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, half, wmma::col_major> twiddle_256_dft_frag[N / (DFT_SIZE * DFT_SIZE)][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for kernels - note that there are 16 / WARP_TILE_SIZE of these now!
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> k_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // load twiddle_256_dft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_256_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // loads SEQUENCE_SIZE into b
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(b),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data),
+      DFT_SIZE * DFT_SIZE / 2); // hopefully this interleaves things correctly
+  // loads SEQUENCE_SIZE into b
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data_2),
+      DFT_SIZE * DFT_SIZE / 2); // hopefully this interleaves things correctly
+  int a_idx, b_idx;
+  __nv_bfloat162 scratch;
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  // load 256 twiddle into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  __syncthreads();
+  // load into twiddle factors
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_16_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data),
+      DFT_SIZE * DFT_SIZE / 2);
+  // start loading ifft twiddle factors
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_16_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data_2),
+      DFT_SIZE * DFT_SIZE / 2);
+  bool a_trans = true;
+  bool b_trans = false;
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_1[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_half[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+// load DFT matrix into b_frag
+#pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      a_idx = a_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(a_frag_dft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + a_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_dft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(a_frag_dft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + a_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_dft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N);
+    }
+  }
+  // load iDFT matrix into b_frag_idft
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_idft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N);
+    }
+  }
+  // load 256 twiddle factors into registers
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE + warp_id * DFT_SIZE * DFT_SIZE;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+      {
+        b_idx = k * WMMA_K * sqrt_N + j_b * WMMA_N;
+        wmma::load_matrix_sync(twiddle_256_dft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, sqrt_N);
+        wmma::load_matrix_sync(twiddle_256_dft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, sqrt_N);
+      }
+    }
+  }
+  __syncthreads();
+  // load twiddle_256_idft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_256_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // load 256 ifft twiddle factors into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  // load twiddles into shared memory
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  __syncthreads();
+  // load 256 idft twiddle factors into registers
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+      {
+        b_idx = j_b * WMMA_N * sqrt_N + k * WMMA_K;
+        wmma::load_matrix_sync(twiddle_256_idft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, 256);
+        wmma::load_matrix_sync(twiddle_256_idft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, 256);
+      }
+    }
+  }
+  // load DFT twiddles into twiddle_dft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_16_dft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(twiddle_16_dft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N);
+    }
+  }
+  // load iDFT twiddles into twiddle_idft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_16_idft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(twiddle_16_idft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N);
+    }
+  }
+  __syncthreads();
+  // #pragma unroll
+  for (int h_tile_id = 0; h_tile_id < H_TILE_SIZE; h_tile_id++)
+  {
+    // start loading k_f
+    // NOTE(danfu): this load from HBM costs about 60 us
+    BlockLoad_Sequence().Load(
+        reinterpret_cast<const c10::complex<float> *>(k_f + h_offset_kernel + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+    // load k_f.conj() into shared memory
+    // #pragma unroll
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].real()),
+        __nv_bfloat16(a_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+      scratch = __hneg2(__nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].imag()),
+        __nv_bfloat16(a_input_data[2 * i + 1].imag())
+      ));
+      reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+    }
+    __syncthreads();
+    // load k_f.conj() into registers in k_frag
+    for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+    {
+      // #pragma unroll
+      for (int j_a = 0; j_a < MATMUL_WARP_WIDTH; j_a++)
+      {
+        // #pragma unroll
+        for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+        {
+          // a_idx = j_a * WMMA_K * sqrt_N + k * WMMA_K + k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+          a_idx = j_a * WMMA_K * sqrt_N +
+                  k * WMMA_K +
+                  k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE +
+                  warp_id * DFT_SIZE * DFT_SIZE;
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][0], reinterpret_cast<__nv_bfloat16 *>(a_real + a_idx), sqrt_N);
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][1], reinterpret_cast<__nv_bfloat16 *>(a_imag + a_idx), sqrt_N);
+        }
+      }
+    }
+    __syncthreads();
+    for(int i = 0; i < items_per_thread_input; i++) {
+        temp[i] = complex_bfloat16_t(0.0f, 0.0f);
+    }
+    // #pragma unroll
+    for (int b_tile_id = 0; b_tile_id < B_TILE_SIZE; b_tile_id++)
+    {
+      int input_offset = h_offset_signal + b_offset + h_tile_id * signal_size + b_tile_id * H * signal_size;
+      int k_idx_offset;
+      // __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+        // outer DFT(dout)
+        complex_matmul_c2c_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<const __nv_bfloat16 *>(dout_real_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<const __nv_bfloat16 *>(dout_imag_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),                 // this is the output
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            wmma::mem_col_major);
+        // outer DFT(x)
+        complex_matmul_c2c_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<const __nv_bfloat16 *>(a_real_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<const __nv_bfloat16 *>(a_imag_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset),                 // this is the output
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE + warp_id * DFT_SIZE * DFT_SIZE;
+        // first DFT, output is NOT written to shared memory
+        // DFT(dout)
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N,
+            N,
+            a_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output IS written to a_real, a_imag
+        // DFT(dout)
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_dft_frag,
+            wmma::mem_row_major);
+        // first DFT, output is NOT written to shared memory
+        // DFT(x)
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset), // this is the output
+            sqrt_N,
+            N,
+            a_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output IS written to a_real, a_imag
+        // DFT(x)
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_dft_frag,
+            wmma::mem_row_major);
+        // dk_f = dout * x.conj()
+        for (int i = 0; i < 256 / 32 / 2; i++)
+        {
+          a_idx = k_idx_offset / 2 + i * 32 + thread_id % 32;
+          complex_mul_conj_bfloat162(
+              reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx],
+              &reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              &reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx]);
+        }
+        __syncthreads();
+        // start computing iFFT(dout)
+        // load the input from acc_frag_1, and multiply by k_frag
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            k_frag[k_idx],
+            wmma::mem_col_major);
+        // __syncthreads();
+        // second iFFT dout, and multiply by twiddle
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            // reinterpret_cast<half *>(out + input_offset + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_idft_frag,
+            wmma::mem_col_major);
+        // __syncthreads();
+      }
+      __syncthreads();
+      // finish iFFT dout
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+        // outer DFT
+        complex_matmul_c2c_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_idft_frag[k_idx],
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // multiply dout by N, and prepare for writing to HBM
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        // reinterpret_cast<__half2 *>(a_input_data)[i] = __hmul2(
+        //     reinterpret_cast<__half2 *>(a_real)[a_idx],
+        //     __half2(__float2half(float(N)), __float2half(float(N))));
+        reinterpret_cast<__nv_bfloat162 *>(a_input_data)[i] = reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx];
+        reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i] = reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx];
+      }
+      // HACK
+      // for now, just output the a_real output
+      BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(dx_out_real + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(a_input_data)
+      );
+      BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(dx_out_imag + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data)
+      );
+      __syncthreads();
+      // put dk_f into a_input_data, and write to HBM
+      __nv_bfloat162 real, imag;
+      #pragma unroll
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        real = reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx];
+        imag = reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx];
+        reinterpret_cast<c10::complex<__nv_bfloat16> *>(a_input_data)[2 * i] = c10::complex<__nv_bfloat16>(real.x, imag.x);
+        reinterpret_cast<c10::complex<__nv_bfloat16> *>(a_input_data)[2 * i + 1] = c10::complex<__nv_bfloat16>(real.y, imag.y);
+      }
+      __syncthreads();
+      for(int i = 0; i < items_per_thread_input; i++) {
+          temp[i] += a_input_data[i];
+      }
+    __syncthreads();
+    } // b_tile_id
+    for(int i = 0; i < items_per_thread_input; i++) {
+        reinterpret_cast<__nv_bfloat162 *>(temp)[i] = __hmul2(reinterpret_cast<__nv_bfloat162 *>(temp)[i], __nv_bfloat162(__float2bfloat16(float(N)), __float2bfloat16(float(N))));
+    }
+    // store dk_f
+      BlockStore_Sequence_Complex().Store(
+          reinterpret_cast<c10::complex<float> *>(dk_f_out + h_offset_kernel + blockIdx.x * H * N + h_tile_id * N),
+          reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(temp));
+  }   // h_tile_id
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_16_16_bwd_kernel_bf16.h ADDED Viewed

	@@ -0,0 +1,828 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include "monarch_cuda_shared_bf16_no_float_shm.h"
+using namespace nvcuda;
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int N, int MATMUL_WARP_WIDTH, int DFT_SIZE, bool RECOMPUTE, int B_TILE_SIZE, int H_TILE_SIZE, int WARP_TILE_SIZE>
+__global__ void monarch_conv_bwd_cuda_kernel(
+    const at::BFloat16 *__restrict__ dout,
+    const at::BFloat16 *__restrict__ a,
+    const c10::complex<at::BFloat16> *__restrict__ k_f,
+    const c10::complex<at::BFloat16> *__restrict__ b,                        // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_256_fft,  // 4096
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_16_fft,   // 256
+    const c10::complex<at::BFloat16> *__restrict__ b_ifft,                   // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_256_ifft, // 4096
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_16_ifft,  // 256
+    at::BFloat16 *dx_out,
+    c10::complex<at::BFloat16> *dk_f_out,
+    const at::BFloat16 *__restrict__ in_gate,
+    const at::BFloat16 *__restrict__ out_gate,
+    at::BFloat16 *din_gate,
+    at::BFloat16 *dout_gate,
+    uint B,
+    uint H,
+    uint signal_size,
+    uint sqrt_N)
+{
+  extern __shared__ at::Half a_real_fp16[];
+  at::BFloat16 *a_real = reinterpret_cast<at::BFloat16 *>(&a_real_fp16[0]);
+  at::BFloat16 *a_imag = &a_real[N];
+  at::BFloat16 *a_real_2 = &a_real[2 * N];
+  at::BFloat16 *a_imag_2 = &a_real[3 * N];
+  at::BFloat16 *b_real = &a_real[4 * N];
+  at::BFloat16 *b_imag = &a_real[4 * N + 256];
+  at::BFloat16 *b_real_2 = &a_real[4 * N + 2 * 256];
+  at::BFloat16 *b_imag_2 = &a_real[4 * N + 3 * 256];
+  const int num_threads = BLOCK_DIM_X * BLOCK_DIM_Y;
+  const int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+  // const int thread_id = threadIdx.x;
+  const int items_per_thread_input = N / num_threads;
+  // this is for reading in the DFT matrix or twiddle factors
+  const int items_per_thread_matrix = num_threads <= 128 ? DFT_SIZE * DFT_SIZE / num_threads : 2;
+  const int warp_id = thread_id / WARP_SIZE;
+  // NOTE - we are loading and storing data in a STRIPED FORMAT
+  // SEQUENCE_SIZE * TILE_SIZE items, WARP_SIZE * TILE_SIZE threads -> items_per_thread_input
+  using BlockLoad_Input = cub::BlockLoad<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Sequence = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Matrix = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT / Twiddle, etc
+  using BlockStore_Sequence = cub::BlockStore<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  using BlockStore_Sequence_Complex = cub::BlockStore<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  // index into block blockIdx.x
+  int b_offset = blockIdx.x * H * signal_size * B_TILE_SIZE;
+  // index into the H
+  int h_offset_signal = blockIdx.y * signal_size * H_TILE_SIZE;
+  int h_offset_kernel = blockIdx.y * N * H_TILE_SIZE;
+  complex_bfloat16_t a_input_data[items_per_thread_input];    // for storing the input, also used for k_f
+  at::BFloat16 x_input_data[items_per_thread_input];     // for storing the input
+  at::BFloat16 gate_data[items_per_thread_input];    // for storing the input gates
+  at::BFloat16 dgate_data[items_per_thread_input];
+  at::BFloat16 dout_data[items_per_thread_input];
+  complex_bfloat16_t temp[items_per_thread_input];
+  complex_bfloat16_t b_input_data[items_per_thread_matrix];   // for storing matrices, twiddle factors
+  complex_bfloat16_t b_input_data_2[items_per_thread_matrix]; // another place for storing matrices, twiddle factors
+  // for the dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for the idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for the dft
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> a_frag_dft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_16_dft_frag[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_16_idft_frag[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for 256 twiddle
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_256_dft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for 256 idft twiddle
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> twiddle_256_idft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // // for twiddles
+  // wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, half, wmma::col_major> twiddle_256_dft_frag[N / (DFT_SIZE * DFT_SIZE)][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for kernels - note that there are 16 / WARP_TILE_SIZE of these now!
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> k_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // load twiddle_256_dft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_256_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // loads SEQUENCE_SIZE into b
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(b),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data),
+      DFT_SIZE * DFT_SIZE / 2); // hopefully this interleaves things correctly
+  // loads SEQUENCE_SIZE into b
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data_2),
+      DFT_SIZE * DFT_SIZE / 2); // hopefully this interleaves things correctly
+  int a_idx, b_idx;
+  __nv_bfloat162 scratch;
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  // load 256 twiddle into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  __syncthreads();
+  // load into twiddle factors
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_16_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data),
+      DFT_SIZE * DFT_SIZE / 2);
+  // start loading ifft twiddle factors
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_16_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data_2),
+      DFT_SIZE * DFT_SIZE / 2);
+  bool a_trans = true;
+  bool b_trans = false;
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_1[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_half[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+// load DFT matrix into b_frag
+#pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      a_idx = a_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(a_frag_dft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + a_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_dft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(a_frag_dft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + a_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_dft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N);
+    }
+  }
+  // load iDFT matrix into b_frag_idft
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_idft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N);
+    }
+  }
+  // load 256 twiddle factors into registers
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE + warp_id * DFT_SIZE * DFT_SIZE;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+      {
+        b_idx = k * WMMA_K * sqrt_N + j_b * WMMA_N;
+        wmma::load_matrix_sync(twiddle_256_dft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, sqrt_N);
+        wmma::load_matrix_sync(twiddle_256_dft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, sqrt_N);
+      }
+    }
+  }
+  __syncthreads();
+  // load twiddle_256_idft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_256_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // load 256 ifft twiddle factors into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  // load twiddles into shared memory
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  __syncthreads();
+  // load 256 idft twiddle factors into registers
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+      {
+        b_idx = j_b * WMMA_N * sqrt_N + k * WMMA_K;
+        wmma::load_matrix_sync(twiddle_256_idft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, 256);
+        wmma::load_matrix_sync(twiddle_256_idft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, 256);
+      }
+    }
+  }
+  // load DFT twiddles into twiddle_dft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_16_dft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(twiddle_16_dft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N);
+    }
+  }
+  // load iDFT twiddles into twiddle_idft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_16_idft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(twiddle_16_idft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N);
+    }
+  }
+  __syncthreads();
+  // #pragma unroll
+  for (int h_tile_id = 0; h_tile_id < H_TILE_SIZE; h_tile_id++)
+  {
+    // start loading k_f
+    // NOTE(danfu): this load from HBM costs about 60 us
+    BlockLoad_Sequence().Load(
+        reinterpret_cast<const c10::complex<float> *>(k_f + h_offset_kernel + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+    // load k_f.conj() into shared memory
+    // #pragma unroll
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].real()),
+        __nv_bfloat16(a_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+      scratch = __hneg2(__nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].imag()),
+        __nv_bfloat16(a_input_data[2 * i + 1].imag())
+      ));
+      reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+    }
+    __syncthreads();
+    // load k_f.conj() into registers in k_frag
+    for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+    {
+      // #pragma unroll
+      for (int j_a = 0; j_a < MATMUL_WARP_WIDTH; j_a++)
+      {
+        // #pragma unroll
+        for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+        {
+          // a_idx = j_a * WMMA_K * sqrt_N + k * WMMA_K + k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+          a_idx = j_a * WMMA_K * sqrt_N +
+                  k * WMMA_K +
+                  k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE +
+                  warp_id * DFT_SIZE * DFT_SIZE;
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][0], reinterpret_cast<__nv_bfloat16 *>(a_real + a_idx), sqrt_N);
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][1], reinterpret_cast<__nv_bfloat16 *>(a_imag + a_idx), sqrt_N);
+        }
+      }
+    }
+    __syncthreads();
+    for(int i = 0; i < items_per_thread_input; i++) {
+        temp[i] = complex_bfloat16_t(0.0f, 0.0f);
+    }
+    // #pragma unroll
+    for (int b_tile_id = 0; b_tile_id < B_TILE_SIZE; b_tile_id++)
+    {
+      int input_offset = h_offset_signal + b_offset + h_tile_id * signal_size + b_tile_id * H * signal_size;
+      int k_idx_offset;
+      // load dout into a_real
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(dout + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2, 0.
+      );
+      if(out_gate != nullptr){
+        // load output gate into gate_data
+        BlockLoad_Input().Load(
+          reinterpret_cast<const float *>(out_gate + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(gate_data),
+          signal_size / 2, 0.
+        );
+      }
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        reinterpret_cast<__nv_bfloat162 *>(dout_data)[i] = reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i];
+        if(out_gate != nullptr){
+          reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = __hmul2(
+            reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i],
+            reinterpret_cast<__nv_bfloat162 *>(gate_data)[i]
+          );
+        }else{
+          reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i];
+        }
+      }
+      __syncthreads();
+      // load input into a_real
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(a + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2, 0.
+      );
+      if(in_gate != nullptr){
+        // load input gate into gate_data
+        BlockLoad_Input().Load(
+          reinterpret_cast<const float *>(in_gate + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(gate_data),
+          signal_size / 2, 0.
+        );
+      }
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        if(in_gate != nullptr){
+          reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx] = __hmul2(
+            reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i],
+            reinterpret_cast<__nv_bfloat162 *>(gate_data)[i]
+          );
+        }else{
+          reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx] = reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i];
+        }
+      }
+      __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+        // outer DFT(dout)
+        complex_matmul_r2c_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // read from SRAM
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),                    // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),                    // this is the output
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            wmma::mem_col_major);
+        // outer DFT(x)
+        complex_matmul_r2c_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset), // read from SRAM
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset),               // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset),               // this is the output
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //    printf("dout @ f_sqrt_N_fft\n");
+      //    for (int i = 0; i < items_per_thread_input; i++) {
+      //       a_idx = i * num_threads + thread_id;
+      //       printf("%f + %fi, ", __half2float(a_real[a_idx]), __half2float(a_imag[a_idx]));
+      //    }
+      //    printf("\n");
+      //    printf("x @ f_sqrt_N_fft\n");
+      //    for (int i = 0; i < items_per_thread_input; i++) {
+      //       a_idx = i * num_threads + thread_id;
+      //       printf("%f + %fi, ", __half2float(a_real_2[a_idx]), __half2float(a_imag_2[a_idx]));
+      //    }
+      //    printf("\n");
+      // }
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE + warp_id * DFT_SIZE * DFT_SIZE;
+        // first DFT, output is NOT written to shared memory
+        // DFT(dout)
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N,
+            N,
+            a_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output IS written to a_real, a_imag
+        // DFT(dout)
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_dft_frag,
+            wmma::mem_row_major);
+        // first DFT, output is NOT written to shared memory
+        // DFT(x)
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset), // this is the output
+            sqrt_N,
+            N,
+            a_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output IS written to a_real, a_imag
+        // DFT(x)
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_dft_frag,
+            wmma::mem_row_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx == 15) {
+        //    printf("DFT(dout)\n");
+        //    for (int i = 0; i < items_per_thread_input; i++) {
+        //       a_idx = i * num_threads + thread_id;
+        //       printf("%f + %fi, ", __half2float(a_real[a_idx]), __half2float(a_imag[a_idx]));
+        //    }
+        //    printf("\n");
+        //    printf("DFT(x)\n");
+        //    for (int i = 0; i < items_per_thread_input; i++) {
+        //       a_idx = i * num_threads + thread_id;
+        //       printf("%f + %fi, ", __half2float(a_real_2[a_idx]), __half2float(a_imag_2[a_idx]));
+        //    }
+        //    printf("\n");
+        // }
+        // // x = x * N
+        // for (int i = 0; i < 256 / 32 / 2; i++)
+        // {
+        //   a_idx = k_idx_offset / 2 + i * 32 + thread_id % 32;
+        //   reinterpret_cast<__half2 *>(a_real_2)[a_idx] = __hmul2(
+        //       reinterpret_cast<__half2 *>(a_real_2)[a_idx],
+        //       __half2(__float2half(float(N)), __float2half(float(N))));
+        //   reinterpret_cast<__half2 *>(a_imag_2)[a_idx] = __hmul2(
+        //       reinterpret_cast<__half2 *>(a_imag_2)[a_idx],
+        //       __half2(__float2half(float(N)), __float2half(float(N))));
+        // }
+        // dk_f = dout * x.conj()
+        for (int i = 0; i < 256 / 32 / 2; i++)
+        {
+          a_idx = k_idx_offset / 2 + i * 32 + thread_id % 32;
+          complex_mul_conj_bfloat162(
+              reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx],
+              &reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              &reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx]);
+        }
+        __syncthreads();
+        // start computing iFFT(dout)
+        // load the input from acc_frag_1, and multiply by k_frag
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            k_frag[k_idx],
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+        //    printf("After ifft\n");
+        //    for (int i = 0; i < items_per_thread_input; i++) {
+        //       a_idx = i * num_threads + thread_id;
+        //       printf("%f + %fi, ", scratch_real[a_idx], scratch_imag[a_idx]);
+        //    }
+        //    printf("\n");
+        // }
+        // __syncthreads();
+        // second iFFT dout, and multiply by twiddle
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            // reinterpret_cast<half *>(out + input_offset + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_idft_frag,
+            wmma::mem_col_major);
+        // __syncthreads();
+      }
+      __syncthreads();
+      // finish iFFT dout
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+        // outer DFT
+        complex_matmul_c2r_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // write to SRAM
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_idft_frag[k_idx],
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //    printf("Before output\n");
+      //    for (int i = 0; i < items_per_thread_input; i++) {
+      //       a_idx = i * num_threads + thread_id;
+      //       printf("%f, ", __half2float(a_real[a_idx]));
+      //    }
+      //    printf("\n");
+      // }
+      // load input into a_real
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(a + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2, 0.
+      );
+      if(in_gate != nullptr){
+        for (int i = 0; i < items_per_thread_input / 2; i++)
+        {
+            a_idx = i * num_threads + thread_id;
+            reinterpret_cast<__nv_bfloat162 *>(dgate_data)[i] = __hmul2(
+              reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i]
+            );
+        }
+        // write to HBM
+        BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(din_gate + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(dgate_data),
+          signal_size / 2
+        );
+      }
+      // multiply dout by N, and prepare for writing to HBM
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        // reinterpret_cast<__half2 *>(a_input_data)[i] = __hmul2(
+        //     reinterpret_cast<__half2 *>(a_real)[a_idx],
+        //     __half2(__float2half(float(N)), __float2half(float(N))));
+        if(in_gate != nullptr){
+          reinterpret_cast<__nv_bfloat162 *>(a_input_data)[i] = __hmul2(
+            reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx],
+            reinterpret_cast<__nv_bfloat162 *>(gate_data)[i]
+          );
+        }else{
+          reinterpret_cast<__nv_bfloat162 *>(a_input_data)[i] = reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx];
+        }
+      }
+      // HACK
+      // for now, just output the a_real output
+      BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(dx_out + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(a_input_data),
+          signal_size / 2
+      );
+      __syncthreads();
+      // put dk_f into a_input_data, and write to HBM
+      __nv_bfloat162 real, imag;
+#pragma unroll
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        real = reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx];
+        imag = reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx];
+        reinterpret_cast<c10::complex<__nv_bfloat16> *>(a_input_data)[2 * i] = c10::complex<__nv_bfloat16>(real.x, imag.x);
+        reinterpret_cast<c10::complex<__nv_bfloat16> *>(a_input_data)[2 * i + 1] = c10::complex<__nv_bfloat16>(real.y, imag.y);
+      }
+      __syncthreads();
+      for(int i = 0; i < items_per_thread_input; i++) {
+          temp[i] += a_input_data[i];
+      }
+    __syncthreads();
+    } // b_tile_id
+    for(int i = 0; i < items_per_thread_input; i++) {
+        reinterpret_cast<__nv_bfloat162 *>(temp)[i] = __hmul2(reinterpret_cast<__nv_bfloat162 *>(temp)[i], __nv_bfloat162(__float2bfloat16(float(N)), __float2bfloat16(float(N))));
+    }
+    // store dk_f
+      BlockStore_Sequence_Complex().Store(
+          reinterpret_cast<c10::complex<float> *>(dk_f_out + h_offset_kernel + blockIdx.x * H * N + h_tile_id * N),
+          reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(temp));
+  }   // h_tile_id
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_16_16_complex_kernel_bf16.h ADDED Viewed

	@@ -0,0 +1,611 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include "monarch_cuda_shared_bf16_no_float_shm.h"
+using namespace nvcuda;
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int N, int MATMUL_WARP_WIDTH, int DFT_SIZE, bool RECOMPUTE, int B_TILE_SIZE, int H_TILE_SIZE, int WARP_TILE_SIZE>
+__global__ void monarch_conv_cuda_complex_kernel(
+    const at::BFloat16 *__restrict__ a_real_inp,
+    const at::BFloat16 *__restrict__ a_imag_inp,
+    const c10::complex<at::BFloat16> *__restrict__ k_f,
+    const c10::complex<at::BFloat16> *__restrict__ b,                        // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_256_fft,  // 4096
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_16_fft,   // 256
+    const c10::complex<at::BFloat16> *__restrict__ b_ifft,                   // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_256_ifft, // 4096
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_16_ifft,  // 256
+    at::BFloat16 *out_real,
+    at::BFloat16 *out_imag,
+    uint B,
+    uint H,
+    uint signal_size,
+    uint sqrt_N)
+{
+  extern __shared__ at::Half a_real_fp16[];
+  at::BFloat16 *a_real = reinterpret_cast<at::BFloat16 *>(&a_real_fp16[0]);
+  at::BFloat16 *a_imag = &a_real[N];
+  at::BFloat16 *b_real = &a_real[2 * N];
+  at::BFloat16 *b_imag = &a_real[2 * N + 256];
+  at::BFloat16 *b_real_2 = &a_real[2 * N + 2 * 256];
+  at::BFloat16 *b_imag_2 = &a_real[2 * N + 3 * 256];
+  const int num_threads = BLOCK_DIM_X * BLOCK_DIM_Y;
+  const int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+  // const int thread_id = threadIdx.x;
+  const int items_per_thread_input = N / num_threads;
+  // this is for reading in the DFT matrix or twiddle factors
+  const int items_per_thread_matrix = num_threads <= 128 ? DFT_SIZE * DFT_SIZE / num_threads : 2;
+  const int warp_id = thread_id / WARP_SIZE;
+  // NOTE - we are loading and storing data in a STRIPED FORMAT
+  // SEQUENCE_SIZE * TILE_SIZE items, WARP_SIZE * TILE_SIZE threads -> items_per_thread_input
+  using BlockLoad_Input = cub::BlockLoad<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Sequence = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Matrix = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT / Twiddle, etc
+  // index into block blockIdx.x
+  int b_offset = blockIdx.x * H * N * B_TILE_SIZE;
+  // index into the H
+  int h_offset = blockIdx.y * N * H_TILE_SIZE;
+  complex_bfloat16_t a_input_data[items_per_thread_input];    // for storing the input, also used for k_f
+  complex_bfloat16_t b_input_data[items_per_thread_matrix];   // for storing matrices, twiddle factors
+  complex_bfloat16_t b_input_data_2[items_per_thread_matrix]; // another place for storing matrices, twiddle factors
+  // for the dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for the idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for the dft
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> a_frag_dft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_16_dft_frag[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_16_idft_frag[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for 256 twiddle
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_256_dft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for 256 idft twiddle
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> twiddle_256_idft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // // for twiddles
+  // wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, half, wmma::col_major> twiddle_256_dft_frag[N / (DFT_SIZE * DFT_SIZE)][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for kernels - note that there are 16 / WARP_TILE_SIZE of these now!
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> k_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // load twiddle_256_dft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_256_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // loads SEQUENCE_SIZE into b
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(b),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data),
+      DFT_SIZE * DFT_SIZE / 2); // hopefully this interleaves things correctly
+  // loads SEQUENCE_SIZE into b
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data_2),
+      DFT_SIZE * DFT_SIZE / 2); // hopefully this interleaves things correctly
+  int a_idx, b_idx;
+  __nv_bfloat162 scratch;
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  // load 256 twiddle into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  __syncthreads();
+  // load into twiddle factors
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_16_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data),
+      DFT_SIZE * DFT_SIZE / 2);
+  // start loading ifft twiddle factors
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_16_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data_2),
+      DFT_SIZE * DFT_SIZE / 2);
+  bool a_trans = true;
+  bool b_trans = false;
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_1[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_half[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+// load DFT matrix into b_frag
+#pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      a_idx = a_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(a_frag_dft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + a_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_dft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(a_frag_dft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + a_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_dft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N);
+    }
+  }
+  // load iDFT matrix into b_frag_idft
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_idft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N);
+    }
+  }
+  // load 256 twiddle factors into registers
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE + warp_id * DFT_SIZE * DFT_SIZE;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+      {
+        b_idx = k * WMMA_K * sqrt_N + j_b * WMMA_N;
+        wmma::load_matrix_sync(twiddle_256_dft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, sqrt_N);
+        wmma::load_matrix_sync(twiddle_256_dft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, sqrt_N);
+      }
+    }
+  }
+  __syncthreads();
+  // load twiddle_256_idft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_256_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // load 256 ifft twiddle factors into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  // load twiddles into shared memory
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  __syncthreads();
+  // load 256 idft twiddle factors into registers
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+      {
+        b_idx = j_b * WMMA_N * sqrt_N + k * WMMA_K;
+        wmma::load_matrix_sync(twiddle_256_idft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, 256);
+        wmma::load_matrix_sync(twiddle_256_idft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, 256);
+      }
+    }
+  }
+  // load DFT twiddles into twiddle_dft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_16_dft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(twiddle_16_dft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N);
+    }
+  }
+  // load iDFT twiddles into twiddle_idft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_16_idft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(twiddle_16_idft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N);
+    }
+  }
+  __syncthreads();
+  // #pragma unroll
+  for (int h_tile_id = 0; h_tile_id < H_TILE_SIZE; h_tile_id++)
+  {
+    // start loading k_f
+    // NOTE(danfu): this load from HBM costs about 60 us
+    BlockLoad_Sequence().Load(
+        reinterpret_cast<const c10::complex<float> *>(k_f + h_offset + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+    // load k_f into shared memory
+    // #pragma unroll
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].real()),
+        __nv_bfloat16(a_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].imag()),
+        __nv_bfloat16(a_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+    }
+    __syncthreads();
+    // load k_f into registers in k_frag
+    // NOTE(danfu): this loop costs 60 us
+    for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+    {
+      // #pragma unroll
+      for (int j_a = 0; j_a < MATMUL_WARP_WIDTH; j_a++)
+      {
+        // #pragma unroll
+        for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+        {
+          // a_idx = j_a * WMMA_K * sqrt_N + k * WMMA_K + k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+          a_idx = j_a * WMMA_K * sqrt_N +
+                  k * WMMA_K +
+                  k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE +
+                  warp_id * DFT_SIZE * DFT_SIZE;
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][0], reinterpret_cast<__nv_bfloat16 *>(a_real + a_idx), sqrt_N);
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][1], reinterpret_cast<__nv_bfloat16 *>(a_imag + a_idx), sqrt_N);
+        }
+      }
+    }
+    __syncthreads();
+    // #pragma unroll
+    for (int b_tile_id = 0; b_tile_id < B_TILE_SIZE; b_tile_id++)
+    {
+      int input_offset = h_offset + b_offset + h_tile_id * N + b_tile_id * H * N;
+      int k_idx_offset;
+      // // load input into a_real
+      // BlockLoad_Input().Load(
+      //   reinterpret_cast<const float *>(a + input_offset),
+      //   reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+      //   signal_size / 2, 0.
+      // );
+      // for (int i = 0; i < items_per_thread_input / 2; i++)
+      // {
+      //   a_idx = i * num_threads + thread_id;
+      //   reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = __nv_bfloat162(
+      //     __nv_bfloat16(x_input_data[2 * i]),
+      //     __nv_bfloat16(x_input_data[2 * i + 1])
+      //   );
+      // }
+      // __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+        // outer DFT
+        complex_matmul_c2c_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<const __nv_bfloat16 *>(a_real_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<const __nv_bfloat16 *>(a_imag_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),                 // this is the output
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE + warp_id * DFT_SIZE * DFT_SIZE;
+        // first DFT, output is NOT written to shared memory
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N,
+            N,
+            a_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output is NOT written to a_real, a_imag
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, true, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_dft_frag,
+            wmma::mem_row_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+        //    printf("After second DFT\n");
+        //    for (int i = 0; i < items_per_thread_input; i++) {
+        //       a_idx = i * num_threads + thread_id;
+        //       printf("%f + %fi, ", __half2float(a_real[a_idx]), __half2float(a_imag[a_idx]));
+        //    }
+        //    printf("\n");
+        // }
+        // __syncthreads();
+        // load the input from acc_frag_1, and multiply by k_frag
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            k_frag[k_idx],
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+        //    printf("After ifft\n");
+        //    for (int i = 0; i < items_per_thread_input; i++) {
+        //       a_idx = i * num_threads + thread_id;
+        //       printf("%f + %fi, ", scratch_real[a_idx], scratch_imag[a_idx]);
+        //    }
+        //    printf("\n");
+        // }
+        // __syncthreads();
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            // reinterpret_cast<half *>(out + input_offset + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_idft_frag,
+            wmma::mem_col_major);
+        // __syncthreads();
+      }
+      __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+        // outer DFT
+        complex_matmul_c2c_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(out_real + input_offset + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(out_imag + input_offset + k_idx_offset), // this is the output
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_idft_frag[k_idx],
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //    printf("Before output\n");
+      //    for (int i = 0; i < items_per_thread_input; i++) {
+      //       a_idx = i * num_threads + thread_id;
+      //       printf("%f, ", __half2float(a_real[a_idx]));
+      //    }
+      //    printf("\n");
+      // }
+      // #pragma unroll
+      // for (int i = 0; i < items_per_thread_input / 2; i++)
+      // {
+      //   a_idx = i * num_threads + thread_id;
+      //   scratch = reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx];
+      //   x_input_data[2 * i] = scratch.x;
+      //   x_input_data[2 * i + 1] = scratch.y;
+      // }
+      // // store a_real
+      // BlockStore_Sequence().Store(
+      //   reinterpret_cast<float *>(out + input_offset),
+      //   reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+      //   signal_size / 2
+      // );
+      // __syncthreads();
+    } // b_tile_id
+  }   // h_tile_id
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_16_16_kernel_bf16.h ADDED Viewed

	@@ -0,0 +1,639 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include "monarch_cuda_shared_bf16_no_float_shm.h"
+using namespace nvcuda;
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int N, int MATMUL_WARP_WIDTH, int DFT_SIZE, bool RECOMPUTE, int B_TILE_SIZE, int H_TILE_SIZE, int WARP_TILE_SIZE>
+__global__ void monarch_conv_cuda_kernel(
+    const at::BFloat16 *__restrict__ a,
+    const at::BFloat16 *__restrict__ in_gate,
+    const c10::complex<at::BFloat16> *__restrict__ k_f,
+    const c10::complex<at::BFloat16> *__restrict__ b,                        // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_256_fft,  // 4096
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_16_fft,   // 256
+    const c10::complex<at::BFloat16> *__restrict__ b_ifft,                   // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_256_ifft, // 4096
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_16_ifft,  // 256
+    at::BFloat16 *out,
+    const at::BFloat16 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    uint signal_size,
+    uint sqrt_N)
+{
+  extern __shared__ at::Half a_real_fp16[];
+  at::BFloat16 *a_real = reinterpret_cast<at::BFloat16 *>(&a_real_fp16[0]);
+  at::BFloat16 *a_imag = &a_real[N];
+  at::BFloat16 *b_real = &a_real[2 * N];
+  at::BFloat16 *b_imag = &a_real[2 * N + 256];
+  at::BFloat16 *b_real_2 = &a_real[2 * N + 2 * 256];
+  at::BFloat16 *b_imag_2 = &a_real[2 * N + 3 * 256];
+  const int num_threads = BLOCK_DIM_X * BLOCK_DIM_Y;
+  const int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+  // const int thread_id = threadIdx.x;
+  const int items_per_thread_input = N / num_threads;
+  // this is for reading in the DFT matrix or twiddle factors
+  const int items_per_thread_matrix = num_threads <= 128 ? DFT_SIZE * DFT_SIZE / num_threads : 2;
+  const int warp_id = thread_id / WARP_SIZE;
+  // NOTE - we are loading and storing data in a STRIPED FORMAT
+  // SEQUENCE_SIZE * TILE_SIZE items, WARP_SIZE * TILE_SIZE threads -> items_per_thread_input
+  using BlockLoad_Input = cub::BlockLoad<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Sequence = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Matrix = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT / Twiddle, etc
+  using BlockStore_Sequence = cub::BlockStore<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  // index into block blockIdx.x
+  int b_offset = blockIdx.x * H * signal_size * B_TILE_SIZE;
+  // index into the H
+  int h_offset_signal = blockIdx.y * signal_size * H_TILE_SIZE;
+  int h_offset_kernel = blockIdx.y * N * H_TILE_SIZE;
+  complex_bfloat16_t a_input_data[items_per_thread_input];    // for storing the input, also used for k_f
+  at::BFloat16 x_input_data[items_per_thread_input];     // for storing the input
+  at::BFloat16 gate_data[items_per_thread_input];        // for storing the gates
+  complex_bfloat16_t b_input_data[items_per_thread_matrix];   // for storing matrices, twiddle factors
+  complex_bfloat16_t b_input_data_2[items_per_thread_matrix]; // another place for storing matrices, twiddle factors
+  // for the dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for the idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for the dft
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> a_frag_dft[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_16_dft_frag[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_16_idft_frag[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for 256 twiddle
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_256_dft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for 256 idft twiddle
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> twiddle_256_idft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // // for twiddles
+  // wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, half, wmma::col_major> twiddle_256_dft_frag[N / (DFT_SIZE * DFT_SIZE)][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // for kernels - note that there are 16 / WARP_TILE_SIZE of these now!
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> k_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  // load twiddle_256_dft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_256_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // loads SEQUENCE_SIZE into b
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(b),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data),
+      DFT_SIZE * DFT_SIZE / 2); // hopefully this interleaves things correctly
+  // loads SEQUENCE_SIZE into b
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data_2),
+      DFT_SIZE * DFT_SIZE / 2); // hopefully this interleaves things correctly
+  int a_idx, b_idx;
+  __nv_bfloat162 scratch;
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  // load 256 twiddle into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  __syncthreads();
+  // load into twiddle factors
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_16_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data),
+      DFT_SIZE * DFT_SIZE / 2);
+  // start loading ifft twiddle factors
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_16_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix / 2]>(b_input_data_2),
+      DFT_SIZE * DFT_SIZE / 2);
+  bool a_trans = true;
+  bool b_trans = false;
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_1[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_half[MATMUL_WARP_WIDTH][MATMUL_WARP_WIDTH][2];
+// load DFT matrix into b_frag
+#pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      a_idx = a_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(a_frag_dft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + a_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_dft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(a_frag_dft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + a_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_dft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N);
+    }
+  }
+  // load iDFT matrix into b_frag_idft
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(b_frag_idft[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N);
+    }
+  }
+  // load 256 twiddle factors into registers
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE + warp_id * DFT_SIZE * DFT_SIZE;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+      {
+        b_idx = k * WMMA_K * sqrt_N + j_b * WMMA_N;
+        wmma::load_matrix_sync(twiddle_256_dft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, sqrt_N);
+        wmma::load_matrix_sync(twiddle_256_dft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, sqrt_N);
+      }
+    }
+  }
+  __syncthreads();
+  // load twiddle_256_idft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_256_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // load 256 ifft twiddle factors into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  // load twiddles into shared memory
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  __syncthreads();
+  // load 256 idft twiddle factors into registers
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+      {
+        b_idx = j_b * WMMA_N * sqrt_N + k * WMMA_K;
+        wmma::load_matrix_sync(twiddle_256_idft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, 256);
+        wmma::load_matrix_sync(twiddle_256_idft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, 256);
+      }
+    }
+  }
+  // load DFT twiddles into twiddle_dft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_16_dft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(twiddle_16_dft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N);
+    }
+  }
+  // load iDFT twiddles into twiddle_idft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N + k * WMMA_K : k * WMMA_K * sqrt_N + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_16_idft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N);
+      wmma::load_matrix_sync(twiddle_16_idft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N);
+    }
+  }
+  __syncthreads();
+  // #pragma unroll
+  for (int h_tile_id = 0; h_tile_id < H_TILE_SIZE; h_tile_id++)
+  {
+    // start loading k_f
+    // NOTE(danfu): this load from HBM costs about 60 us
+    BlockLoad_Sequence().Load(
+        reinterpret_cast<const c10::complex<float> *>(k_f + h_offset_kernel + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+    // load k_f into shared memory
+    // #pragma unroll
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].real()),
+        __nv_bfloat16(a_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].imag()),
+        __nv_bfloat16(a_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+    }
+    __syncthreads();
+    // load k_f into registers in k_frag
+    // NOTE(danfu): this loop costs 60 us
+    for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+    {
+      // #pragma unroll
+      for (int j_a = 0; j_a < MATMUL_WARP_WIDTH; j_a++)
+      {
+        // #pragma unroll
+        for (int k = 0; k < MATMUL_WARP_WIDTH; k++)
+        {
+          // a_idx = j_a * WMMA_K * sqrt_N + k * WMMA_K + k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+          a_idx = j_a * WMMA_K * sqrt_N +
+                  k * WMMA_K +
+                  k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE +
+                  warp_id * DFT_SIZE * DFT_SIZE;
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][0], reinterpret_cast<__nv_bfloat16 *>(a_real + a_idx), sqrt_N);
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][1], reinterpret_cast<__nv_bfloat16 *>(a_imag + a_idx), sqrt_N);
+        }
+      }
+    }
+    __syncthreads();
+    // #pragma unroll
+    for (int b_tile_id = 0; b_tile_id < B_TILE_SIZE; b_tile_id++)
+    {
+      int input_offset = h_offset_signal + b_offset + h_tile_id * signal_size + b_tile_id * H * signal_size;
+      int k_idx_offset;
+      // load input into a_real
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(a + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2, 0.
+      );
+          if(in_gate != nullptr){
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(in_gate + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(gate_data),
+        signal_size / 2, 0.
+      );
+    }
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      if(in_gate != nullptr){
+        reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx]  = __hmul2(
+          reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i],
+          reinterpret_cast<__nv_bfloat162 *>(gate_data)[i]
+        );
+      }else{
+        reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i];
+      }
+    }
+    if(out_gate != nullptr){
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(out_gate + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(gate_data),
+        signal_size / 2, 0.
+      );
+    }
+      __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+        // outer DFT
+        complex_matmul_r2c_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // read from SRAM
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),                 // this is the output
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE * DFT_SIZE + warp_id * DFT_SIZE * DFT_SIZE;
+        // first DFT, output is NOT written to shared memory
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N,
+            N,
+            a_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output is NOT written to a_real, a_imag
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH, true, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_dft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_dft_frag,
+            wmma::mem_row_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+        //    printf("After second DFT\n");
+        //    for (int i = 0; i < items_per_thread_input; i++) {
+        //       a_idx = i * num_threads + thread_id;
+        //       printf("%f + %fi, ", __half2float(a_real[a_idx]), __half2float(a_imag[a_idx]));
+        //    }
+        //    printf("\n");
+        // }
+        // __syncthreads();
+        // load the input from acc_frag_1, and multiply by k_frag
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            k_frag[k_idx],
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+        //    printf("After ifft\n");
+        //    for (int i = 0; i < items_per_thread_input; i++) {
+        //       a_idx = i * num_threads + thread_id;
+        //       printf("%f + %fi, ", scratch_real[a_idx], scratch_imag[a_idx]);
+        //    }
+        //    printf("\n");
+        // }
+        // __syncthreads();
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            // reinterpret_cast<half *>(out + input_offset + k_idx_offset),
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_16_idft_frag,
+            wmma::mem_col_major);
+        // __syncthreads();
+      }
+      __syncthreads();
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * DFT_SIZE + warp_id * DFT_SIZE;
+        // outer DFT
+        complex_matmul_c2r_256<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // write to SRAM
+            sqrt_N,
+            N,
+            b_frag_idft,
+            acc_frag_1,
+            acc_frag_half,
+            twiddle_256_idft_frag[k_idx],
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //    printf("Before output\n");
+      //    for (int i = 0; i < items_per_thread_input; i++) {
+      //       a_idx = i * num_threads + thread_id;
+      //       printf("%f, ", __half2float(a_real[a_idx]));
+      //    }
+      //    printf("\n");
+      // }
+#pragma unroll
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        if(out_gate != nullptr){
+          reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i] = __hmul2(
+            reinterpret_cast<__nv_bfloat162 *>(gate_data)[i],
+            reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx]
+          );
+        }else{
+          reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i] = reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx];
+        }
+      }
+      // store a_real
+      BlockStore_Sequence().Store(
+        reinterpret_cast<float *>(out + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2
+      );
+      __syncthreads();
+    } // b_tile_id
+  }   // h_tile_id
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_32_32_bwd_complex_kernel_bf16.h ADDED Viewed

	@@ -0,0 +1,746 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include "monarch_cuda_shared_bf16_no_float_shm.h"
+using namespace nvcuda;
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int N, int MATMUL_WARP_WIDTH_1, int MATMUL_WARP_WIDTH_2, int DFT_SIZE, bool RECOMPUTE, int B_TILE_SIZE, int H_TILE_SIZE, int WARP_TILE_SIZE>
+__global__ void monarch_conv_bwd_cuda_16_32_32_complex_kernel(
+    const at::BFloat16 *__restrict__ dout_real_inp,
+    const at::BFloat16 *__restrict__ dout_imag_inp,
+    const at::BFloat16 *__restrict__ a_real_inp,
+    const at::BFloat16 *__restrict__ a_imag_inp,
+    const c10::complex<at::BFloat16> *__restrict__ k_f,
+    const c10::complex<at::BFloat16> *__restrict__ b_16,                        // 32 x 32
+    const c10::complex<at::BFloat16> *__restrict__ b_32,                        // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_N_fft,  // 16K
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_32_fft,   // 1024
+    const c10::complex<at::BFloat16> *__restrict__ b_16_ifft,                   // 32 x 32
+    const c10::complex<at::BFloat16> *__restrict__ b_32_ifft,                   // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_N_ifft, // 16K
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_32_ifft,  // 1024
+    at::BFloat16 *dx_out_real,
+    at::BFloat16 *dx_out_imag,
+    c10::complex<at::BFloat16> *dk_f_out,
+    uint B,
+    uint H,
+    uint signal_size)
+{
+  const uint sqrt_N_1 = 16;
+  const uint sqrt_N_2 = 32;
+  const uint N_1 = 256;
+  const uint N_2 = 1024;
+  extern __shared__ at::Half a_real_fp16[];
+  at::BFloat16 *a_real = reinterpret_cast<at::BFloat16 *>(&a_real_fp16[0]);
+  at::BFloat16 *a_imag = &a_real[N];
+  at::BFloat16 *a_real_2 = &a_real[2 * N];
+  at::BFloat16 *a_imag_2 = &a_real[3 * N];
+  at::BFloat16 *b_real = &a_real[4 * N];
+  at::BFloat16 *b_imag = &a_real[4 * N + N_2];
+  at::BFloat16 *b_real_2 = &a_real[4 * N + 2 * N_2];
+  at::BFloat16 *b_imag_2 = &a_real[4 * N + 3 * N_2];
+  const int num_threads = BLOCK_DIM_X * BLOCK_DIM_Y;
+  const int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+  // const int thread_id = threadIdx.x;
+  const int items_per_thread_input = N / num_threads;
+  // this is for reading in the DFT matrix or twiddle factors
+  const int items_per_thread_matrix_N_1 = num_threads <= 128 ? N_1 / num_threads : 2;
+  const int items_per_thread_matrix_N_2 = N_2 / num_threads;
+  const int warp_id = thread_id / WARP_SIZE;
+  // NOTE - we are loading and storing data in a STRIPED FORMAT
+  // SEQUENCE_SIZE * TILE_SIZE items, WARP_SIZE * TILE_SIZE threads -> items_per_thread_input
+  using BlockLoad_Input = cub::BlockLoad<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Sequence = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Matrix_N_1 = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix_N_1 / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT
+  using BlockLoad_Matrix_N_2 = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix_N_2 / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT
+  using BlockStore_Sequence = cub::BlockStore<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  using BlockStore_Sequence_Complex = cub::BlockStore<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  // index into block blockIdx.x
+  int b_offset = blockIdx.x * H * signal_size * B_TILE_SIZE;
+  // index into the H
+  int h_offset_signal = blockIdx.y * signal_size * H_TILE_SIZE;
+  int h_offset_kernel = blockIdx.y * N * H_TILE_SIZE;
+  complex_bfloat16_t a_input_data[items_per_thread_input];    // for storing the input, also used for k_f
+  at::BFloat16 x_input_data[items_per_thread_input];     // for storing the input
+  complex_bfloat16_t temp[items_per_thread_input];
+  complex_bfloat16_t b_input_data[items_per_thread_matrix_N_2];   // for storing matrices
+  complex_bfloat16_t b_input_data_2[items_per_thread_matrix_N_2]; // another place for storing matrices
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft_N_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // for the 32 x 32 idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft_N_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 32 x 32 idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> a_frag_dft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 32 x 32 twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_32_dft_frag[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 32 x 32 twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_32_idft_frag[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 16 x 1024 twiddle
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_1024_dft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 16 x 1024 idft twiddle - split into 64 x (16 x 16)
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> twiddle_1024_idft_frag[64 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // accumulator fragments for the 16 x 16 and 32 x 32
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_1_half[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_2_half[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for kernels - note that there are 16 / WARP_TILE_SIZE of these now!
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> k_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // load twiddle_N_dft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_N_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // loads b_16 into b
+  BlockLoad_Matrix_N_1().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_16),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_1 / 2]>(b_input_data),
+      N_1 / 2); // hopefully this interleaves things correctly
+  // loads b_16_ifft into b
+  BlockLoad_Matrix_N_1().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_16_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_1 / 2]>(b_input_data_2),
+      N_1 / 2); // hopefully this interleaves things correctly
+  int a_idx, b_idx;
+  __nv_bfloat162 scratch;
+  // load the 16x16 DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix_N_1 / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128)
+    {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  // load N twiddle into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  __syncthreads();
+  // load in 32x32 twiddle factors
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_32_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data),
+      N_2 / 2);
+  // start loading 32x32 ifft twiddle factors
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_32_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data_2),
+      N_2 / 2);
+  bool a_trans = true;
+  bool b_trans = false;
+  // load 16x16 DFT matrix into b_frag_dft_N_1
+  #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_1 + k * WMMA_K : k * WMMA_K * sqrt_N_1 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_dft_N_1[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_1);
+      wmma::load_matrix_sync(b_frag_dft_N_1[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_1);
+    }
+  }
+  // load 16x16 iDFT matrix into b_frag_idft_N_1
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_1 + k * WMMA_K : k * WMMA_K * sqrt_N_1 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft_N_1[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_1);
+      wmma::load_matrix_sync(b_frag_idft_N_1[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_1);
+    }
+  }
+  // load N twiddle factors into registers
+  // these will be loaded into the inner loop, so treat them as 16 x 1024
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 + warp_id * sqrt_N_2 * sqrt_N_2;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+      {
+        b_idx = k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+        wmma::load_matrix_sync(twiddle_1024_dft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, sqrt_N_2);
+        wmma::load_matrix_sync(twiddle_1024_dft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, sqrt_N_2);
+      }
+    }
+  }
+  __syncthreads();
+  // load twiddle_N_idft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_N_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // load N ifft twiddle factors into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  // load 32x32 twiddles into shared memory
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_matrix_N_2 / 2; i++)
+  {
+    b_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].real()),
+      __nv_bfloat16(b_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].imag()),
+      __nv_bfloat16(b_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].real()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].imag()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+  }
+  __syncthreads();
+  // start loading 32x32 DFT matrices
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_32),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data),
+      N_2 / 2);
+  // start loading 32x32 iDFT matrices
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_32_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data_2),
+      N_2 / 2);
+  // load N idft twiddle factors into registers
+  // these will be used in the last iFFT, so treat them as 32 x 32 x 8
+  for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+      {
+        b_idx = j_b * WMMA_N * 1024 + k * WMMA_K;
+        wmma::load_matrix_sync(twiddle_1024_idft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, 1024);
+        wmma::load_matrix_sync(twiddle_1024_idft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, 1024);
+      }
+    }
+  }
+  // load 32x32 DFT twiddles into twiddle_dft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_32_dft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(twiddle_32_dft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_2);
+    }
+  }
+  // load iDFT twiddles into twiddle_idft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_32_idft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(twiddle_32_idft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_2);
+    }
+  }
+  __syncthreads();
+  // load the 32x32 DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_matrix_N_2 / 2; i++)
+  {
+    b_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].real()),
+      __nv_bfloat16(b_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].imag()),
+      __nv_bfloat16(b_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].real()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].imag()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+  }
+  __syncthreads();
+  // load the 32x32 DFT matrices into b_frag_dft_N_2, b_frag_idft_N_2
+  #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      a_idx = a_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(a_frag_dft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + a_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_dft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(a_frag_dft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + a_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_dft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_2);
+    }
+  }
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_idft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_2);
+    }
+  }
+  // #pragma unroll
+  for (int h_tile_id = 0; h_tile_id < H_TILE_SIZE; h_tile_id++)
+  {
+    // start loading k_f
+    // NOTE(danfu): this load from HBM costs about 60 us
+    BlockLoad_Sequence().Load(
+        reinterpret_cast<const c10::complex<float> *>(k_f + h_offset_kernel + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+    // load k_f.conj() into shared memory
+    // #pragma unroll
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].real()),
+        __nv_bfloat16(a_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+      scratch = __hneg2(__nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].imag()),
+        __nv_bfloat16(a_input_data[2 * i + 1].imag())
+      ));
+      reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+    }
+    __syncthreads();
+    // load k_f.conj() into registers in k_frag
+    // in the inner loop, so treat as 32 x 256
+    for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+    {
+      // #pragma unroll
+      for (int j_a = 0; j_a < MATMUL_WARP_WIDTH_2; j_a++)
+      {
+        // #pragma unroll
+        for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+        {
+          // a_idx = j_a * WMMA_K * sqrt_N + k * WMMA_K + k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+          a_idx = j_a * WMMA_K * sqrt_N_2 +
+                  k * WMMA_K +
+                  k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 +
+                  warp_id * sqrt_N_2 * sqrt_N_2;
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][0], reinterpret_cast<__nv_bfloat16 *>(a_real + a_idx), sqrt_N_2);
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][1], reinterpret_cast<__nv_bfloat16 *>(a_imag + a_idx), sqrt_N_2);
+        }
+      }
+    }
+    for(int i = 0; i < items_per_thread_input; i++) {
+      temp[i] = complex_bfloat16_t(0.0f, 0.0f);
+    }
+    __syncthreads();
+    // #pragma unroll
+    for (int b_tile_id = 0; b_tile_id < B_TILE_SIZE; b_tile_id++)
+    {
+      int input_offset = h_offset_signal + b_offset + h_tile_id * signal_size + b_tile_id * H * signal_size;
+      int k_idx_offset;
+      // __syncthreads();
+      // 1024 / 16 = 64
+      for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+        // outer DFT(dout)
+        complex_matmul_c2c_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<const __nv_bfloat16 *>(dout_real_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<const __nv_bfloat16 *>(dout_imag_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),                 // this is the output
+            sqrt_N_1,
+            N,
+            b_frag_dft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            wmma::mem_col_major);
+        // outer DFT(x)
+        complex_matmul_c2c_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<const __nv_bfloat16 *>(a_real_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<const __nv_bfloat16 *>(a_imag_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset),                 // this is the output
+            sqrt_N_1,
+            N,
+            b_frag_dft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // 16 times (32, 32)
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 + warp_id * sqrt_N_2 * sqrt_N_2;
+        // first DFT, output is NOT written to shared memory
+        // DFT(dout)
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N_2,
+            N,
+            a_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_1024_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output is NOT written to a_real, a_imag
+        // DFT(dout)
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_dft_frag,
+            wmma::mem_row_major);
+        // first DFT, output is NOT written to shared memory
+        // DFT(x)
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset), // this is the output
+            sqrt_N_2,
+            N,
+            a_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_1024_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output is NOT written to a_real, a_imag
+        // DFT(x)
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_dft_frag,
+            wmma::mem_row_major);
+        // x = x * N
+        for (int i = 0; i < 1024 / 32 / 2; i++)
+        {
+          a_idx = k_idx_offset / 2 + i * 32 + thread_id % 32;
+          reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx] = __hmul2(
+              reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              __nv_bfloat162(__float2bfloat16(float(N)), __float2bfloat16(float(N))));
+          reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx] = __hmul2(
+              reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx],
+              __nv_bfloat162(__float2bfloat16(float(N)), __float2bfloat16(float(N))));
+        }
+        __syncthreads();
+        // dk_f = dout * x.conj()
+        for (int i = 0; i < 1024 / 32 / 2; i++)
+        {
+          a_idx = k_idx_offset / 2 + i * 32 + thread_id % 32;
+          complex_mul_conj_bfloat162(
+              reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx],
+              &reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              &reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx]);
+        }
+        __syncthreads();
+        // start computing iFFT(dout)
+        // load the input from acc_frag_1, and multiply by k_frag
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH_2, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_idft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            k_frag[k_idx],
+            wmma::mem_col_major);
+        // __syncthreads();
+        // second iFFT dout
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH_2, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            // reinterpret_cast<__nv_bfloat16 *>(out + input_offset + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_idft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_idft_frag,
+            wmma::mem_col_major);
+        // __syncthreads();
+      }
+      __syncthreads();
+      // finish iFFT dout
+      // 1024 / 16 = 64
+      for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+        // outer DFT
+        complex_matmul_c2c_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N_1,
+            N,
+            b_frag_idft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            twiddle_1024_idft_frag[k_idx],
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      #pragma unroll
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        // reinterpret_cast<__nv_bfloat16 *>(a_input_data)[i] = __hmul2(
+        //     reinterpret_cast<__nv_bfloat16 *>(a_real)[a_idx],
+        //     __nv_bfloat16(__float2bfloat16(float(N)), __float2bfloat16(float(N))));
+        reinterpret_cast<__nv_bfloat162 *>(a_input_data)[i] = reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx];
+        reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i] = reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx];
+      }
+      // HACK
+      // for now, just output the a_real output
+      BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(dx_out_real + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(a_input_data)
+      );
+      BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(dx_out_imag + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data)
+      );
+      __syncthreads();
+      // put dk_f into a_input_data, and udpate temp
+      __nv_bfloat162 real, imag;
+      #pragma unroll
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        real = reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx];
+        imag = reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx];
+        reinterpret_cast<complex_bfloat16_t *>(a_input_data)[2 * i] = complex_bfloat16_t(real.x, imag.x);
+        reinterpret_cast<complex_bfloat16_t *>(a_input_data)[2 * i + 1] = complex_bfloat16_t(real.y, imag.y);
+      }
+      for(int i = 0; i < items_per_thread_input; i++) {
+        temp[i] += a_input_data[i];
+      }
+    } // b_tile_id
+    // store dk_f
+    BlockStore_Sequence_Complex().Store(
+        reinterpret_cast<c10::complex<float> *>(dk_f_out + h_offset_kernel + blockIdx.x * H * N + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(temp));
+    __syncthreads();
+  }   // h_tile_id
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_32_32_bwd_kernel_bf16.h ADDED Viewed

	@@ -0,0 +1,877 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include "monarch_cuda_shared_bf16_no_float_shm.h"
+using namespace nvcuda;
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int N, int MATMUL_WARP_WIDTH_1, int MATMUL_WARP_WIDTH_2, int DFT_SIZE, bool RECOMPUTE, int B_TILE_SIZE, int H_TILE_SIZE, int WARP_TILE_SIZE>
+__global__ void monarch_conv_bwd_cuda_16_32_32_kernel(
+    const at::BFloat16 *__restrict__ dout,
+    const at::BFloat16 *__restrict__ a,
+    const c10::complex<at::BFloat16> *__restrict__ k_f,
+    const c10::complex<at::BFloat16> *__restrict__ b_16,                        // 32 x 32
+    const c10::complex<at::BFloat16> *__restrict__ b_32,                        // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_N_fft,  // 16K
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_32_fft,   // 1024
+    const c10::complex<at::BFloat16> *__restrict__ b_16_ifft,                   // 32 x 32
+    const c10::complex<at::BFloat16> *__restrict__ b_32_ifft,                   // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_N_ifft, // 16K
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_32_ifft,  // 1024
+    at::BFloat16 *dx_out,
+    c10::complex<at::BFloat16> *dk_f_out,
+    const at::BFloat16 *__restrict__ in_gate,
+    const at::BFloat16 *__restrict__ out_gate,
+    at::BFloat16 *din_gate,
+    at::BFloat16 *dout_gate,
+    uint B,
+    uint H,
+    uint signal_size)
+{
+  const uint sqrt_N_1 = 16;
+  const uint sqrt_N_2 = 32;
+  const uint N_1 = 256;
+  const uint N_2 = 1024;
+  extern __shared__ at::Half a_real_fp16[];
+  at::BFloat16 *a_real = reinterpret_cast<at::BFloat16 *>(&a_real_fp16[0]);
+  at::BFloat16 *a_imag = &a_real[N];
+  at::BFloat16 *a_real_2 = &a_real[2 * N];
+  at::BFloat16 *a_imag_2 = &a_real[3 * N];
+  at::BFloat16 *b_real = &a_real[4 * N];
+  at::BFloat16 *b_imag = &a_real[4 * N + N_2];
+  at::BFloat16 *b_real_2 = &a_real[4 * N + 2 * N_2];
+  at::BFloat16 *b_imag_2 = &a_real[4 * N + 3 * N_2];
+  const int num_threads = BLOCK_DIM_X * BLOCK_DIM_Y;
+  const int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+  // const int thread_id = threadIdx.x;
+  const int items_per_thread_input = N / num_threads;
+  // this is for reading in the DFT matrix or twiddle factors
+  const int items_per_thread_matrix_N_1 = num_threads <= 128 ? N_1 / num_threads : 2;
+  const int items_per_thread_matrix_N_2 = N_2 / num_threads;
+  const int warp_id = thread_id / WARP_SIZE;
+  // NOTE - we are loading and storing data in a STRIPED FORMAT
+  // SEQUENCE_SIZE * TILE_SIZE items, WARP_SIZE * TILE_SIZE threads -> items_per_thread_input
+  using BlockLoad_Input = cub::BlockLoad<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Sequence = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Matrix_N_1 = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix_N_1 / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT
+  using BlockLoad_Matrix_N_2 = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix_N_2 / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT
+  using BlockStore_Sequence = cub::BlockStore<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  using BlockStore_Sequence_Complex = cub::BlockStore<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  // index into block blockIdx.x
+  int b_offset = blockIdx.x * H * signal_size * B_TILE_SIZE;
+  // index into the H
+  int h_offset_signal = blockIdx.y * signal_size * H_TILE_SIZE;
+  int h_offset_kernel = blockIdx.y * N * H_TILE_SIZE;
+  complex_bfloat16_t a_input_data[items_per_thread_input];    // for storing the input, also used for k_f
+  at::BFloat16 x_input_data[items_per_thread_input];     // for storing the input
+  at::BFloat16 gate_data[items_per_thread_input];    // for storing the input gates
+  at::BFloat16 dgate_data[items_per_thread_input];
+  at::BFloat16 dout_data[items_per_thread_input];
+  complex_bfloat16_t temp[items_per_thread_input];
+  complex_bfloat16_t b_input_data[items_per_thread_matrix_N_2];   // for storing matrices
+  complex_bfloat16_t b_input_data_2[items_per_thread_matrix_N_2]; // another place for storing matrices
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft_N_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // for the 32 x 32 idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft_N_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 32 x 32 idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> a_frag_dft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 32 x 32 twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_32_dft_frag[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 32 x 32 twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_32_idft_frag[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 16 x 1024 twiddle
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_1024_dft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 16 x 1024 idft twiddle - split into 64 x (16 x 16)
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> twiddle_1024_idft_frag[64 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // accumulator fragments for the 16 x 16 and 32 x 32
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_1_half[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_2_half[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for kernels - note that there are 16 / WARP_TILE_SIZE of these now!
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> k_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // load twiddle_N_dft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_N_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // loads b_16 into b
+  BlockLoad_Matrix_N_1().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_16),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_1 / 2]>(b_input_data),
+      N_1 / 2); // hopefully this interleaves things correctly
+  // loads b_16_ifft into b
+  BlockLoad_Matrix_N_1().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_16_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_1 / 2]>(b_input_data_2),
+      N_1 / 2); // hopefully this interleaves things correctly
+  int a_idx, b_idx;
+  __nv_bfloat162 scratch;
+  // load the 16x16 DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix_N_1 / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128)
+    {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  // load N twiddle into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  __syncthreads();
+  // load in 32x32 twiddle factors
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_32_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data),
+      N_2 / 2);
+  // start loading 32x32 ifft twiddle factors
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_32_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data_2),
+      N_2 / 2);
+  bool a_trans = true;
+  bool b_trans = false;
+  // load 16x16 DFT matrix into b_frag_dft_N_1
+  #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_1 + k * WMMA_K : k * WMMA_K * sqrt_N_1 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_dft_N_1[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_1);
+      wmma::load_matrix_sync(b_frag_dft_N_1[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_1);
+    }
+  }
+  // load 16x16 iDFT matrix into b_frag_idft_N_1
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_1 + k * WMMA_K : k * WMMA_K * sqrt_N_1 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft_N_1[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_1);
+      wmma::load_matrix_sync(b_frag_idft_N_1[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_1);
+    }
+  }
+  // load N twiddle factors into registers
+  // these will be loaded into the inner loop, so treat them as 16 x 1024
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 + warp_id * sqrt_N_2 * sqrt_N_2;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+      {
+        b_idx = k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+        wmma::load_matrix_sync(twiddle_1024_dft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, sqrt_N_2);
+        wmma::load_matrix_sync(twiddle_1024_dft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, sqrt_N_2);
+      }
+    }
+  }
+  __syncthreads();
+  // load twiddle_N_idft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_N_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // load N ifft twiddle factors into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  // load 32x32 twiddles into shared memory
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_matrix_N_2 / 2; i++)
+  {
+    b_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].real()),
+      __nv_bfloat16(b_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].imag()),
+      __nv_bfloat16(b_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].real()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].imag()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+  }
+  __syncthreads();
+  // start loading 32x32 DFT matrices
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_32),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data),
+      N_2 / 2);
+  // start loading 32x32 iDFT matrices
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_32_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data_2),
+      N_2 / 2);
+  // load N idft twiddle factors into registers
+  // these will be used in the last iFFT, so treat them as 32 x 32 x 8
+  for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+      {
+        b_idx = j_b * WMMA_N * 1024 + k * WMMA_K;
+        wmma::load_matrix_sync(twiddle_1024_idft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, 1024);
+        wmma::load_matrix_sync(twiddle_1024_idft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, 1024);
+      }
+    }
+  }
+  // load 32x32 DFT twiddles into twiddle_dft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_32_dft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(twiddle_32_dft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_2);
+    }
+  }
+  // load iDFT twiddles into twiddle_idft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_32_idft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(twiddle_32_idft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_2);
+    }
+  }
+  __syncthreads();
+  // load the 32x32 DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_matrix_N_2 / 2; i++)
+  {
+    b_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].real()),
+      __nv_bfloat16(b_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].imag()),
+      __nv_bfloat16(b_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].real()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].imag()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+  }
+  __syncthreads();
+  // load the 32x32 DFT matrices into b_frag_dft_N_2, b_frag_idft_N_2
+  #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      a_idx = a_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(a_frag_dft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + a_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_dft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(a_frag_dft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + a_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_dft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_2);
+    }
+  }
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_idft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_2);
+    }
+  }
+  // #pragma unroll
+  for (int h_tile_id = 0; h_tile_id < H_TILE_SIZE; h_tile_id++)
+  {
+    // start loading k_f
+    // NOTE(danfu): this load from HBM costs about 60 us
+    BlockLoad_Sequence().Load(
+        reinterpret_cast<const c10::complex<float> *>(k_f + h_offset_kernel + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+    // load k_f.conj() into shared memory
+    // #pragma unroll
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].real()),
+        __nv_bfloat16(a_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+      scratch = __hneg2(__nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].imag()),
+        __nv_bfloat16(a_input_data[2 * i + 1].imag())
+      ));
+      reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+    }
+    __syncthreads();
+    // load k_f.conj() into registers in k_frag
+    // in the inner loop, so treat as 32 x 256
+    for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+    {
+      // #pragma unroll
+      for (int j_a = 0; j_a < MATMUL_WARP_WIDTH_2; j_a++)
+      {
+        // #pragma unroll
+        for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+        {
+          // a_idx = j_a * WMMA_K * sqrt_N + k * WMMA_K + k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+          a_idx = j_a * WMMA_K * sqrt_N_2 +
+                  k * WMMA_K +
+                  k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 +
+                  warp_id * sqrt_N_2 * sqrt_N_2;
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][0], reinterpret_cast<__nv_bfloat16 *>(a_real + a_idx), sqrt_N_2);
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][1], reinterpret_cast<__nv_bfloat16 *>(a_imag + a_idx), sqrt_N_2);
+        }
+      }
+    }
+    for(int i = 0; i < items_per_thread_input; i++) {
+      temp[i] = complex_bfloat16_t(0.0f, 0.0f);
+    }
+    __syncthreads();
+    // #pragma unroll
+    for (int b_tile_id = 0; b_tile_id < B_TILE_SIZE; b_tile_id++)
+    {
+      int input_offset = h_offset_signal + b_offset + h_tile_id * signal_size + b_tile_id * H * signal_size;
+      int k_idx_offset;
+      // load dout into a_real
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(dout + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2, 0.
+      );
+      if(out_gate != nullptr){
+        // load output gate into gate_data
+        BlockLoad_Input().Load(
+          reinterpret_cast<const float *>(out_gate + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(gate_data),
+          signal_size / 2, 0.
+        );
+      }
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        reinterpret_cast<__nv_bfloat162 *>(dout_data)[i] = reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i];
+        if(out_gate != nullptr){
+          reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = __hmul2(
+            reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i],
+            reinterpret_cast<__nv_bfloat162 *>(gate_data)[i]
+          );
+        }else{
+          reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i];
+        }
+      }
+      __syncthreads();
+      // load input into a_real
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(a + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2, 0.
+      );
+      if(in_gate != nullptr){
+        // load input gate into gate_data
+        BlockLoad_Input().Load(
+          reinterpret_cast<const float *>(in_gate + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(gate_data),
+          signal_size / 2, 0.
+        );
+      }
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        if(in_gate != nullptr){
+          reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx] = __hmul2(
+            reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i],
+            reinterpret_cast<__nv_bfloat162 *>(gate_data)[i]
+          );
+        }else{
+          reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx] = reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i];
+        }
+      }
+      __syncthreads();
+      // 1024 / 16 = 64
+      for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+        // outer DFT(dout)
+        complex_matmul_r2c_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // read from HBM
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),                 // this is the output
+            sqrt_N_1,
+            N,
+            b_frag_dft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            wmma::mem_col_major);
+        // outer DFT(x)
+        complex_matmul_r2c_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset), // read from HBM
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset),                 // this is the output
+            sqrt_N_1,
+            N,
+            b_frag_dft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // 16 times (32, 32)
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 + warp_id * sqrt_N_2 * sqrt_N_2;
+        // first DFT, output is NOT written to shared memory
+        // DFT(dout)
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N_2,
+            N,
+            a_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_1024_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output is NOT written to a_real, a_imag
+        // DFT(dout)
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_dft_frag,
+            wmma::mem_row_major);
+        // first DFT, output is NOT written to shared memory
+        // DFT(x)
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset), // this is the output
+            sqrt_N_2,
+            N,
+            a_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_1024_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // __syncthreads();
+        // second DFT, output is NOT written to a_real, a_imag
+        // DFT(x)
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real_2 + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag_2 + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_dft_frag,
+            wmma::mem_row_major);
+        // x = x * N
+        for (int i = 0; i < 1024 / 32 / 2; i++)
+        {
+          a_idx = k_idx_offset / 2 + i * 32 + thread_id % 32;
+          reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx] = __hmul2(
+              reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              __nv_bfloat162(__float2bfloat16(float(N)), __float2bfloat16(float(N))));
+          reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx] = __hmul2(
+              reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx],
+              __nv_bfloat162(__float2bfloat16(float(N)), __float2bfloat16(float(N))));
+        }
+        // dk_f = dout * x.conj()
+        for (int i = 0; i < 1024 / 32 / 2; i++)
+        {
+          a_idx = k_idx_offset / 2 + i * 32 + thread_id % 32;
+          complex_mul_conj_bfloat162(
+              reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx],
+              &reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx],
+              &reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx]);
+        }
+        __syncthreads();
+        // start computing iFFT(dout)
+        // load the input from acc_frag_1, and multiply by k_frag
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH_2, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_idft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            k_frag[k_idx],
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After iDFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 8; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat16float(a_real[a_idx]), __nv_bfloat16float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+        // second iFFT dout
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH_2, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            // reinterpret_cast<__nv_bfloat16 *>(out + input_offset + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_idft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_idft_frag,
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After 2nd iDFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 8; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat16float(a_real[a_idx]), __nv_bfloat16float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("After inner conv\n");
+      //   for (int i = 0; i < items_per_thread_input; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f + %fi, ", __nv_bfloat16float(a_real[a_idx]), __nv_bfloat16float(a_imag[a_idx]));
+      //   }
+      //   printf("\n");
+      // }
+      // finish iFFT dout
+      // 1024 / 16 = 64
+      for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+        // outer DFT
+        complex_matmul_c2r_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // write to SRAM
+            sqrt_N_1,
+            N,
+            b_frag_idft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            twiddle_1024_idft_frag[k_idx],
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+        // load input into a_real
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(a + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2, 0.
+      );
+      if(in_gate != nullptr){
+        for (int i = 0; i < items_per_thread_input / 2; i++)
+        {
+            a_idx = i * num_threads + thread_id;
+            reinterpret_cast<__nv_bfloat162 *>(dgate_data)[i] = __hmul2(
+              reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx],
+              reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i]
+            );
+        }
+        // write to HBM
+        BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(din_gate + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(dgate_data),
+          signal_size / 2
+        );
+      }
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //    printf("Before output\n");
+      //    for (int i = 0; i < items_per_thread_input; i++) {
+      //       a_idx = i * num_threads + thread_id;
+      //       printf("%f, ", __nv_bfloat16float(a_real[a_idx]));
+      //    }
+      //    printf("\n");
+      // }
+      __syncthreads();
+      #pragma unroll
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        // reinterpret_cast<__nv_bfloat16 *>(a_input_data)[i] = __hmul2(
+        //     reinterpret_cast<__nv_bfloat16 *>(a_real)[a_idx],
+        //     __nv_bfloat16(__float2bfloat16(float(N)), __float2bfloat16(float(N))));
+        if(in_gate != nullptr){
+          reinterpret_cast<__nv_bfloat162 *>(a_input_data)[i] = __hmul2(
+            reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx],
+            reinterpret_cast<__nv_bfloat162 *>(gate_data)[i]
+          );
+        }else{
+          reinterpret_cast<__nv_bfloat162 *>(a_input_data)[i] = reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx];
+        }
+      }
+      // HACK
+      // for now, just output the a_real output
+      BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(dx_out + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(a_input_data),
+          signal_size / 2
+      );
+      __syncthreads();
+      // put dk_f into a_input_data, and udpate temp
+      __nv_bfloat162 real, imag;
+      #pragma unroll
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        real = reinterpret_cast<__nv_bfloat162 *>(a_real_2)[a_idx];
+        imag = reinterpret_cast<__nv_bfloat162 *>(a_imag_2)[a_idx];
+        reinterpret_cast<complex_bfloat16_t *>(a_input_data)[2 * i] = complex_bfloat16_t(real.x, imag.x);
+        reinterpret_cast<complex_bfloat16_t *>(a_input_data)[2 * i + 1] = complex_bfloat16_t(real.y, imag.y);
+      }
+      for(int i = 0; i < items_per_thread_input; i++) {
+        temp[i] += a_input_data[i];
+      }
+    } // b_tile_id
+    // store dk_f
+    BlockStore_Sequence_Complex().Store(
+        reinterpret_cast<c10::complex<float> *>(dk_f_out + h_offset_kernel + blockIdx.x * H * N + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(temp));
+    __syncthreads();
+  }   // h_tile_id
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_32_32_complex_kernel_bf16.h ADDED Viewed

	@@ -0,0 +1,741 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include "monarch_cuda_shared_bf16_no_float_shm.h"
+using namespace nvcuda;
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int N, int MATMUL_WARP_WIDTH_1, int MATMUL_WARP_WIDTH_2, int DFT_SIZE, bool RECOMPUTE, int B_TILE_SIZE, int H_TILE_SIZE, int WARP_TILE_SIZE>
+__global__ void monarch_conv_cuda_16_32_32_complex_kernel(
+    const at::BFloat16 *__restrict__ a_real_inp,
+    const at::BFloat16 *__restrict__ a_imag_inp,
+    const c10::complex<at::BFloat16> *__restrict__ k_f,
+    const c10::complex<at::BFloat16> *__restrict__ b_16,                        // 32 x 32
+    const c10::complex<at::BFloat16> *__restrict__ b_32,                        // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_N_fft,  // 16K
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_32_fft,   // 1024
+    const c10::complex<at::BFloat16> *__restrict__ b_16_ifft,                   // 32 x 32
+    const c10::complex<at::BFloat16> *__restrict__ b_32_ifft,                   // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_N_ifft, // 16K
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_32_ifft,  // 1024
+    at::BFloat16 *out_real,
+    at::BFloat16 *out_imag,
+    uint B,
+    uint H,
+    uint signal_size)
+{
+  const uint sqrt_N_1 = 16;
+  const uint sqrt_N_2 = 32;
+  const uint N_1 = 256;
+  const uint N_2 = 1024;
+  extern __shared__ at::Half a_real_fp16[];
+  at::BFloat16 *a_real = reinterpret_cast<at::BFloat16 *>(&a_real_fp16[0]);
+  at::BFloat16 *a_imag = &a_real[N];
+  at::BFloat16 *b_real = &a_real[2 * N];
+  at::BFloat16 *b_imag = &a_real[2 * N + N_2];
+  at::BFloat16 *b_real_2 = &a_real[2 * N + 2 * N_2];
+  at::BFloat16 *b_imag_2 = &a_real[2 * N + 3 * N_2];
+  const int num_threads = BLOCK_DIM_X * BLOCK_DIM_Y;
+  const int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+  // const int thread_id = threadIdx.x;
+  const int items_per_thread_input = N / num_threads;
+  // this is for reading in the DFT matrix or twiddle factors
+  const int items_per_thread_matrix_N_1 = num_threads <= 128 ? N_1 / num_threads : 2;
+  const int items_per_thread_matrix_N_2 = N_2 / num_threads;
+  const int warp_id = thread_id / WARP_SIZE;
+  // NOTE - we are loading and storing data in a STRIPED FORMAT
+  // SEQUENCE_SIZE * TILE_SIZE items, WARP_SIZE * TILE_SIZE threads -> items_per_thread_input
+  using BlockLoad_Input = cub::BlockLoad<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Sequence = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Matrix_N_1 = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix_N_1 / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT
+  using BlockLoad_Matrix_N_2 = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix_N_2 / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT
+  // index into block blockIdx.x
+  int b_offset = blockIdx.x * H * N * B_TILE_SIZE;
+  // index into the H
+  int h_offset = blockIdx.y * N * H_TILE_SIZE;
+  complex_bfloat16_t a_input_data[items_per_thread_input];    // for storing the input, also used for k_f
+  complex_bfloat16_t b_input_data[items_per_thread_matrix_N_2];   // for storing matrices
+  complex_bfloat16_t b_input_data_2[items_per_thread_matrix_N_2]; // another place for storing matrices
+  // for the 16 x 16 dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft_N_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // for the 16 x 16 idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft_N_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 32 x 32 idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> a_frag_dft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 32 x 32 twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_32_dft_frag[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 32 x 32 twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_32_idft_frag[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 16 x 1024 twiddle
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_1024_dft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 16 x 1024 idft twiddle - split into 64 x (16 x 16)
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> twiddle_1024_idft_frag[64 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // accumulator fragments for the 16 x 16 and 32 x 32
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_1_half[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_2_half[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for kernels - note that there are 16 / WARP_TILE_SIZE of these now!
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> k_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // load twiddle_N_dft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_N_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // loads b_16 into b
+  BlockLoad_Matrix_N_1().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_16),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_1 / 2]>(b_input_data),
+      N_1 / 2); // hopefully this interleaves things correctly
+  // loads b_16_ifft into b
+  BlockLoad_Matrix_N_1().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_16_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_1 / 2]>(b_input_data_2),
+      N_1 / 2); // hopefully this interleaves things correctly
+  int a_idx, b_idx;
+  __nv_bfloat162 scratch;
+  // load the 16x16 DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix_N_1 / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  // load N twiddle into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  __syncthreads();
+  // load in 32x32 twiddle factors
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_32_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data),
+      N_2 / 2);
+  // start loading 32x32 ifft twiddle factors
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_32_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data_2),
+      N_2 / 2);
+  bool a_trans = true;
+  bool b_trans = false;
+  // load 16x16 DFT matrix into b_frag_dft_N_1
+  #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_1 + k * WMMA_K : k * WMMA_K * sqrt_N_1 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_dft_N_1[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_1);
+      wmma::load_matrix_sync(b_frag_dft_N_1[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_1);
+    }
+  }
+  // load 16x16 iDFT matrix into b_frag_idft_N_1
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_1 + k * WMMA_K : k * WMMA_K * sqrt_N_1 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft_N_1[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_1);
+      wmma::load_matrix_sync(b_frag_idft_N_1[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_1);
+    }
+  }
+  // load N twiddle factors into registers
+  // these will be loaded into the inner loop, so treat them as 16 x 1024
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 + warp_id * sqrt_N_2 * sqrt_N_2;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+      {
+        b_idx = k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+        wmma::load_matrix_sync(twiddle_1024_dft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, sqrt_N_2);
+        wmma::load_matrix_sync(twiddle_1024_dft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, sqrt_N_2);
+      }
+    }
+  }
+  __syncthreads();
+  // load twiddle_N_idft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_N_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // load N ifft twiddle factors into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  // load 32x32 twiddles into shared memory
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_matrix_N_2 / 2; i++)
+  {
+    b_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].real()),
+      __nv_bfloat16(b_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].imag()),
+      __nv_bfloat16(b_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].real()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].imag()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+  }
+  __syncthreads();
+  // start loading 32x32 DFT matrices
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_32),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data),
+      N_2 / 2);
+  // start loading 32x32 iDFT matrices
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_32_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data_2),
+      N_2 / 2);
+  // load N idft twiddle factors into registers
+  // these will be used in the last iFFT, so treat them as 32 x 32 x 8
+  for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+      {
+        b_idx = j_b * WMMA_N * 1024 + k * WMMA_K;
+        wmma::load_matrix_sync(twiddle_1024_idft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, 1024);
+        wmma::load_matrix_sync(twiddle_1024_idft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, 1024);
+      }
+    }
+  }
+  // load 32x32 DFT twiddles into twiddle_dft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_32_dft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(twiddle_32_dft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_2);
+    }
+  }
+  // load iDFT twiddles into twiddle_idft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_32_idft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(twiddle_32_idft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_2);
+    }
+  }
+  __syncthreads();
+  // load the 32x32 DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_matrix_N_2 / 2; i++)
+  {
+    b_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].real()),
+      __nv_bfloat16(b_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].imag()),
+      __nv_bfloat16(b_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].real()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].imag()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+  }
+  __syncthreads();
+  // load the 32x32 DFT matrices into b_frag_dft_N_2, b_frag_idft_N_2
+  #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      a_idx = a_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(a_frag_dft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + a_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_dft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(a_frag_dft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + a_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_dft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_2);
+    }
+  }
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_idft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_2);
+    }
+  }
+  // #pragma unroll
+  for (int h_tile_id = 0; h_tile_id < H_TILE_SIZE; h_tile_id++)
+  {
+    // start loading k_f
+    // NOTE(danfu): this load from HBM costs about 60 us
+    BlockLoad_Sequence().Load(
+        reinterpret_cast<const c10::complex<float> *>(k_f + h_offset + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+    // load k_f into shared memory
+    // #pragma unroll
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].imag()),
+        __nv_bfloat16(a_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+    }
+    __syncthreads();
+    // load k_f into registers in k_frag
+    // in the inner loop, so treat as 16 x 1024
+    for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+    {
+      // #pragma unroll
+      for (int j_a = 0; j_a < MATMUL_WARP_WIDTH_2; j_a++)
+      {
+        // #pragma unroll
+        for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+        {
+          // a_idx = j_a * WMMA_K * sqrt_N + k * WMMA_K + k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+          a_idx = j_a * WMMA_K * sqrt_N_2 +
+                  k * WMMA_K +
+                  k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 +
+                  warp_id * sqrt_N_2 * sqrt_N_2;
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][0], reinterpret_cast<__nv_bfloat16 *>(a_real + a_idx), sqrt_N_2);
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][1], reinterpret_cast<__nv_bfloat16 *>(a_imag + a_idx), sqrt_N_2);
+        }
+      }
+    }
+    __syncthreads();
+    // #pragma unroll
+    for (int b_tile_id = 0; b_tile_id < B_TILE_SIZE; b_tile_id++)
+    {
+      int input_offset = h_offset + b_offset + h_tile_id * N + b_tile_id * H * N;
+      int k_idx_offset;
+      // // load input into a_real
+      // BlockLoad_Input().Load(
+      //   reinterpret_cast<const float *>(a + input_offset),
+      //   reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+      //   signal_size / 2, 0.
+      // );
+      // for (int i = 0; i < items_per_thread_input / 2; i++)
+      // {
+      //   a_idx = i * num_threads + thread_id;
+      //   reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = __nv_bfloat162(
+      //     __nv_bfloat16(x_input_data[2 * i]),
+      //     __nv_bfloat16(x_input_data[2 * i + 1])
+      //   );
+      // }
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("x_input_data\n");
+      //   for (int i = 0; i < items_per_thread_input / 2; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f, ", __bfloat162float(__nv_bfloat16(x_input_data[2 * i])));
+      //   }
+      //   printf("\n");
+      // }
+      // __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("Before first DFT\n");
+      //   for (int i = 0; i < items_per_thread_input; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])));
+      //   }
+      //   printf("\n");
+      //   // printf("Before first DFT\n");
+      //   // for (int i = 0; i < 32; i++) {
+      //   //   a_idx = i;
+      //   //   printf("%f, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])));
+      //   // }
+      //   // printf("\n");
+      // }
+      __syncthreads();
+      // 1024 / 16 = 64
+      for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+        // outer DFT
+        complex_matmul_c2c_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<const __nv_bfloat16 *>(a_real_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<const __nv_bfloat16 *>(a_imag_inp + input_offset + k_idx_offset),                 // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),                 // this is the output
+            sqrt_N_1,
+            N,
+            b_frag_dft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("After first DFT\n");
+      //   for (int i = 0; i < items_per_thread_input; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f + %fi, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])), __bfloat162float(__nv_bfloat16(a_imag[a_idx])));
+      //   }
+      //   printf("\n");
+      // }
+      // 16 times (32, 32)
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 + warp_id * sqrt_N_2 * sqrt_N_2;
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("k_idx %d, k_idx_offset %d\n", k_idx, k_idx_offset);
+        // }
+        // first DFT, output is NOT written to shared memory
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N_2,
+            N,
+            a_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_1024_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After first DFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 32; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat162float(a_real[a_idx]), __nv_bfloat162float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+        // second DFT, output is NOT written to a_real, a_imag
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, true, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_dft_frag,
+            wmma::mem_row_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After second DFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 8; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat162float(a_real[a_idx]), __nv_bfloat162float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+        // load the input from acc_frag_1, and multiply by k_frag
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH_2, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_idft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            k_frag[k_idx],
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After iDFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 8; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat162float(a_real[a_idx]), __nv_bfloat162float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH_2, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            // reinterpret_cast<__nv_bfloat16 *>(out + input_offset + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_idft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_idft_frag,
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After 2nd iDFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 8; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat162float(a_real[a_idx]), __nv_bfloat162float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("After inner conv\n");
+      //   for (int i = 0; i < items_per_thread_input; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f + %fi, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])), __bfloat162float(__nv_bfloat16(a_imag[a_idx])));
+      //   }
+      //   printf("\n");
+      // }
+      // 1024 / 16 = 64
+      for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+        // outer DFT
+        complex_matmul_c2c_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(out_real + input_offset + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(out_imag + input_offset + k_idx_offset), // this is the output
+            sqrt_N_1,
+            N,
+            b_frag_idft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            twiddle_1024_idft_frag[k_idx],
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //    printf("Before output\n");
+      //    for (int i = 0; i < items_per_thread_input; i++) {
+      //       a_idx = i * num_threads + thread_id;
+      //       printf("%f, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])));
+      //    }
+      //    printf("\n");
+      // }
+      // #pragma unroll
+      // for (int i = 0; i < items_per_thread_input / 2; i++)
+      // {
+      //   a_idx = i * num_threads + thread_id;
+      //   scratch = reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx];
+      //   x_input_data[2 * i] = scratch.x;
+      //   x_input_data[2 * i + 1] = scratch.y;
+      // }
+      // // HACK
+      // // for now, just output the a_real output
+      // BlockStore_Sequence().Store(
+      //     reinterpret_cast<float *>(out + input_offset),
+      //     reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+      //     signal_size / 2
+      // );
+      // __syncthreads();
+    } // b_tile_id
+  }   // h_tile_id
+}

overlay/kernels/cuda/flashfftconv/csrc/monarch_cuda/kernels_bf16/monarch_cuda_16_32_32_kernel_bf16.h ADDED Viewed

	@@ -0,0 +1,769 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include "monarch_cuda_shared_bf16_no_float_shm.h"
+using namespace nvcuda;
+template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int N, int MATMUL_WARP_WIDTH_1, int MATMUL_WARP_WIDTH_2, int DFT_SIZE, bool RECOMPUTE, int B_TILE_SIZE, int H_TILE_SIZE, int WARP_TILE_SIZE>
+__global__ void monarch_conv_cuda_16_32_32_kernel(
+    const at::BFloat16 *__restrict__ a,
+    const at::BFloat16 *__restrict__ in_gate,
+    const c10::complex<at::BFloat16> *__restrict__ k_f,
+    const c10::complex<at::BFloat16> *__restrict__ b_16,                        // 32 x 32
+    const c10::complex<at::BFloat16> *__restrict__ b_32,                        // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_N_fft,  // 16K
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_32_fft,   // 1024
+    const c10::complex<at::BFloat16> *__restrict__ b_16_ifft,                   // 32 x 32
+    const c10::complex<at::BFloat16> *__restrict__ b_32_ifft,                   // 16 x 16
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_N_ifft, // 16K
+    const c10::complex<at::BFloat16> *__restrict__ twiddle_factors_32_ifft,  // 1024
+    at::BFloat16 *out,
+    const at::BFloat16 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    uint signal_size)
+{
+  const uint sqrt_N_1 = 16;
+  const uint sqrt_N_2 = 32;
+  const uint N_1 = 256;
+  const uint N_2 = 1024;
+  extern __shared__ at::Half a_real_fp16[];
+  at::BFloat16 *a_real = reinterpret_cast<at::BFloat16 *>(&a_real_fp16[0]);
+  at::BFloat16 *a_imag = &a_real[N];
+  at::BFloat16 *b_real = &a_real[2 * N];
+  at::BFloat16 *b_imag = &a_real[2 * N + N_2];
+  at::BFloat16 *b_real_2 = &a_real[2 * N + 2 * N_2];
+  at::BFloat16 *b_imag_2 = &a_real[2 * N + 3 * N_2];
+  const int num_threads = BLOCK_DIM_X * BLOCK_DIM_Y;
+  const int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+  // const int thread_id = threadIdx.x;
+  const int items_per_thread_input = N / num_threads;
+  // this is for reading in the DFT matrix or twiddle factors
+  const int items_per_thread_matrix_N_1 = num_threads <= 128 ? N_1 / num_threads : 2;
+  const int items_per_thread_matrix_N_2 = N_2 / num_threads;
+  const int warp_id = thread_id / WARP_SIZE;
+  // NOTE - we are loading and storing data in a STRIPED FORMAT
+  // SEQUENCE_SIZE * TILE_SIZE items, WARP_SIZE * TILE_SIZE threads -> items_per_thread_input
+  using BlockLoad_Input = cub::BlockLoad<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Sequence = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>;
+  using BlockLoad_Matrix_N_1 = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix_N_1 / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT
+  using BlockLoad_Matrix_N_2 = cub::BlockLoad<c10::complex<float>, BLOCK_DIM_X, items_per_thread_matrix_N_2 / 2, cub::BLOCK_LOAD_STRIPED, BLOCK_DIM_Y>; // for the DFT
+  using BlockStore_Sequence = cub::BlockStore<float, BLOCK_DIM_X, items_per_thread_input / 2, cub::BLOCK_STORE_STRIPED, BLOCK_DIM_Y>;
+  // index into block blockIdx.x
+  int b_offset = blockIdx.x * H * signal_size * B_TILE_SIZE;
+  // index into the H
+  int h_offset_signal = blockIdx.y * signal_size * H_TILE_SIZE;
+  int h_offset_kernel = blockIdx.y * N * H_TILE_SIZE;
+  complex_bfloat16_t a_input_data[items_per_thread_input];    // for storing the input, also used for k_f
+  at::BFloat16 x_input_data[items_per_thread_input];     // for storing the input
+  at::BFloat16 gate_data[items_per_thread_input];
+  complex_bfloat16_t b_input_data[items_per_thread_matrix_N_2];   // for storing matrices
+  complex_bfloat16_t b_input_data_2[items_per_thread_matrix_N_2]; // another place for storing matrices
+  // for the 16 x 16 dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft_N_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // for the 16 x 16 idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft_N_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_dft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 32 x 32 idft
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> b_frag_idft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 32 x 32 dft
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> a_frag_dft_N_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 32 x 32 twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_32_dft_frag[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 32 x 32 twiddles
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_32_idft_frag[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for the 16 x 1024 twiddle
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> twiddle_1024_dft_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for 16 x 1024 idft twiddle - split into 64 x (16 x 16)
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::col_major> twiddle_1024_idft_frag[64 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  // accumulator fragments for the 16 x 16 and 32 x 32
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_1[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, float> acc_frag_2[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_1_half[MATMUL_WARP_WIDTH_1][MATMUL_WARP_WIDTH_1][2];
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_K, WMMA_N, half> acc_frag_2_half[MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // for kernels - note that there are 16 / WARP_TILE_SIZE of these now!
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_K, WMMA_N, __nv_bfloat16, wmma::row_major> k_frag[16 / WARP_TILE_SIZE][MATMUL_WARP_WIDTH_2][MATMUL_WARP_WIDTH_2][2];
+  // load twiddle_N_dft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_N_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // loads b_16 into b
+  BlockLoad_Matrix_N_1().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_16),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_1 / 2]>(b_input_data),
+      N_1 / 2); // hopefully this interleaves things correctly
+  // loads b_16_ifft into b
+  BlockLoad_Matrix_N_1().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_16_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_1 / 2]>(b_input_data_2),
+      N_1 / 2); // hopefully this interleaves things correctly
+  int a_idx, b_idx;
+  __nv_bfloat162 scratch;
+  // load the 16x16 DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  if (num_threads <= 128) {
+    for (int i = 0; i < items_per_thread_matrix_N_1 / 2; i++)
+    {
+      b_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].real()),
+        __nv_bfloat16(b_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[2 * i].imag()),
+        __nv_bfloat16(b_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].real()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[2 * i].imag()),
+        __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  } else {
+    if (thread_id < 128) {
+      b_idx = thread_id;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].real()),
+        __nv_bfloat16(b_input_data[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data[0].imag()),
+        __nv_bfloat16(b_input_data[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].real()),
+        __nv_bfloat16(b_input_data_2[1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(b_input_data_2[0].imag()),
+        __nv_bfloat16(b_input_data_2[1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+    }
+  }
+  // load N twiddle into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  __syncthreads();
+  // load in 32x32 twiddle factors
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_32_fft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data),
+      N_2 / 2);
+  // start loading 32x32 ifft twiddle factors
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_32_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data_2),
+      N_2 / 2);
+  bool a_trans = true;
+  bool b_trans = false;
+  // load 16x16 DFT matrix into b_frag_dft_N_1
+  #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_1 + k * WMMA_K : k * WMMA_K * sqrt_N_1 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_dft_N_1[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_1);
+      wmma::load_matrix_sync(b_frag_dft_N_1[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_1);
+    }
+  }
+  // load 16x16 iDFT matrix into b_frag_idft_N_1
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_1 + k * WMMA_K : k * WMMA_K * sqrt_N_1 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft_N_1[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_1);
+      wmma::load_matrix_sync(b_frag_idft_N_1[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_1);
+    }
+  }
+  // load N twiddle factors into registers
+  // these will be loaded into the inner loop, so treat them as 16 x 1024
+  for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 + warp_id * sqrt_N_2 * sqrt_N_2;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+      {
+        b_idx = k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+        wmma::load_matrix_sync(twiddle_1024_dft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, sqrt_N_2);
+        wmma::load_matrix_sync(twiddle_1024_dft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, sqrt_N_2);
+      }
+    }
+  }
+  __syncthreads();
+  // load twiddle_N_idft
+  BlockLoad_Sequence().Load(
+      reinterpret_cast<const c10::complex<float> *>(twiddle_factors_N_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+  // load N ifft twiddle factors into shared memory
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_input / 2; i++)
+  {
+    a_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].imag()),
+      __nv_bfloat16(a_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+  }
+  // load 32x32 twiddles into shared memory
+  // load the DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_matrix_N_2 / 2; i++)
+  {
+    b_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].real()),
+      __nv_bfloat16(b_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].imag()),
+      __nv_bfloat16(b_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].real()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].imag()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+  }
+  __syncthreads();
+  // start loading 32x32 DFT matrices
+  // NOTE(danfu): this takes about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_32),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data),
+      N_2 / 2);
+  // start loading 32x32 iDFT matrices
+  // TODO(danfu): this costs about 60 us
+  BlockLoad_Matrix_N_2().Load(
+      reinterpret_cast<const c10::complex<float> *>(b_32_ifft),
+      reinterpret_cast<c10::complex<float>(&)[items_per_thread_matrix_N_2 / 2]>(b_input_data_2),
+      N_2 / 2);
+  // load N idft twiddle factors into registers
+  // these will be used in the last iFFT, so treat them as 32 x 32 x 8
+  for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+  {
+    int k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+    for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_1; j_b++)
+    {
+      // #pragma unroll
+      for (int k = 0; k < MATMUL_WARP_WIDTH_1; k++)
+      {
+        b_idx = j_b * WMMA_N * 1024 + k * WMMA_K;
+        wmma::load_matrix_sync(twiddle_1024_idft_frag[k_idx][k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(a_real) + k_idx_offset + b_idx, 1024);
+        wmma::load_matrix_sync(twiddle_1024_idft_frag[k_idx][k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(a_imag) + k_idx_offset + b_idx, 1024);
+      }
+    }
+  }
+  // load 32x32 DFT twiddles into twiddle_dft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_32_dft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(twiddle_32_dft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_2);
+    }
+  }
+  // load iDFT twiddles into twiddle_idft_frag
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(twiddle_32_idft_frag[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(twiddle_32_idft_frag[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_2);
+    }
+  }
+  __syncthreads();
+  // load the 32x32 DFT matrix into b_real, b_imag
+  // this costs about 60 us
+  // #pragma unroll
+  for (int i = 0; i < items_per_thread_matrix_N_2 / 2; i++)
+  {
+    b_idx = i * num_threads + thread_id;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].real()),
+      __nv_bfloat16(b_input_data[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data[2 * i].imag()),
+      __nv_bfloat16(b_input_data[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].real()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].real())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_real_2)[b_idx] = scratch;
+    scratch = __nv_bfloat162(
+      __nv_bfloat16(b_input_data_2[2 * i].imag()),
+      __nv_bfloat16(b_input_data_2[2 * i + 1].imag())
+    );
+    reinterpret_cast<__nv_bfloat162 *>(b_imag_2)[b_idx] = scratch;
+  }
+  __syncthreads();
+  // load the 32x32 DFT matrices into b_frag_dft_N_2, b_frag_idft_N_2
+  #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      a_idx = a_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(a_frag_dft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + a_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_dft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(a_frag_dft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + a_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_dft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag) + b_idx, sqrt_N_2);
+    }
+  }
+  // #pragma unroll
+  for (int j_b = 0; j_b < MATMUL_WARP_WIDTH_2; j_b++)
+  {
+    // #pragma unroll
+    for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+    {
+      b_idx = b_trans ? j_b * WMMA_N * sqrt_N_2 + k * WMMA_K : k * WMMA_K * sqrt_N_2 + j_b * WMMA_N;
+      wmma::load_matrix_sync(b_frag_idft_N_2[k][j_b][0], reinterpret_cast<__nv_bfloat16 *>(b_real_2) + b_idx, sqrt_N_2);
+      wmma::load_matrix_sync(b_frag_idft_N_2[k][j_b][1], reinterpret_cast<__nv_bfloat16 *>(b_imag_2) + b_idx, sqrt_N_2);
+    }
+  }
+  // #pragma unroll
+  for (int h_tile_id = 0; h_tile_id < H_TILE_SIZE; h_tile_id++)
+  {
+    // start loading k_f
+    // NOTE(danfu): this load from HBM costs about 60 us
+    BlockLoad_Sequence().Load(
+        reinterpret_cast<const c10::complex<float> *>(k_f + h_offset_kernel + h_tile_id * N),
+        reinterpret_cast<c10::complex<float>(&)[items_per_thread_input / 2]>(a_input_data));
+    // load k_f into shared memory
+    // #pragma unroll
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      scratch = __nv_bfloat162(
+      __nv_bfloat16(a_input_data[2 * i].real()),
+      __nv_bfloat16(a_input_data[2 * i + 1].real())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = scratch;
+      scratch = __nv_bfloat162(
+        __nv_bfloat16(a_input_data[2 * i].imag()),
+        __nv_bfloat16(a_input_data[2 * i + 1].imag())
+      );
+      reinterpret_cast<__nv_bfloat162 *>(a_imag)[a_idx] = scratch;
+    }
+    __syncthreads();
+    // load k_f into registers in k_frag
+    // in the inner loop, so treat as 16 x 1024
+    for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+    {
+      // #pragma unroll
+      for (int j_a = 0; j_a < MATMUL_WARP_WIDTH_2; j_a++)
+      {
+        // #pragma unroll
+        for (int k = 0; k < MATMUL_WARP_WIDTH_2; k++)
+        {
+          // a_idx = j_a * WMMA_K * sqrt_N + k * WMMA_K + k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+          a_idx = j_a * WMMA_K * sqrt_N_2 +
+                  k * WMMA_K +
+                  k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 +
+                  warp_id * sqrt_N_2 * sqrt_N_2;
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][0], reinterpret_cast<__nv_bfloat16 *>(a_real + a_idx), sqrt_N_2);
+          wmma::load_matrix_sync(k_frag[k_idx][j_a][k][1], reinterpret_cast<__nv_bfloat16 *>(a_imag + a_idx), sqrt_N_2);
+        }
+      }
+    }
+    __syncthreads();
+    // #pragma unroll
+    for (int b_tile_id = 0; b_tile_id < B_TILE_SIZE; b_tile_id++)
+    {
+      int input_offset = h_offset_signal + b_offset + h_tile_id * signal_size + b_tile_id * H * signal_size;
+      int k_idx_offset;
+      // load input into a_real
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(a + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+        signal_size / 2, 0.
+      );
+          if(in_gate != nullptr){
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(in_gate + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(gate_data),
+        signal_size / 2, 0.
+      );
+    }
+    for (int i = 0; i < items_per_thread_input / 2; i++)
+    {
+      a_idx = i * num_threads + thread_id;
+      if(in_gate != nullptr){
+        reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx]  = __hmul2(
+          reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i],
+          reinterpret_cast<__nv_bfloat162 *>(gate_data)[i]
+        );
+      }else{
+        reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx] = reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i];
+      }
+    }
+    if(out_gate != nullptr){
+      BlockLoad_Input().Load(
+        reinterpret_cast<const float *>(out_gate + input_offset),
+        reinterpret_cast<float(&)[items_per_thread_input / 2]>(gate_data),
+        signal_size / 2, 0.
+      );
+    }
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("x_input_data\n");
+      //   for (int i = 0; i < items_per_thread_input / 2; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f, ", __bfloat162float(__nv_bfloat16(x_input_data[2 * i])));
+      //   }
+      //   printf("\n");
+      // }
+      // __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("Before first DFT\n");
+      //   for (int i = 0; i < items_per_thread_input; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])));
+      //   }
+      //   printf("\n");
+      //   // printf("Before first DFT\n");
+      //   // for (int i = 0; i < 32; i++) {
+      //   //   a_idx = i;
+      //   //   printf("%f, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])));
+      //   // }
+      //   // printf("\n");
+      // }
+      __syncthreads();
+      // 1024 / 16 = 64
+      for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+        // outer DFT
+        complex_matmul_r2c_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // read from SRAM
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),                 // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),                 // this is the output
+            sqrt_N_1,
+            N,
+            b_frag_dft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("After first DFT\n");
+      //   for (int i = 0; i < items_per_thread_input; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f + %fi, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])), __bfloat162float(__nv_bfloat16(a_imag[a_idx])));
+      //   }
+      //   printf("\n");
+      // }
+      // 16 times (32, 32)
+      for (int k_idx = 0; k_idx < 16 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_2 * sqrt_N_2 + warp_id * sqrt_N_2 * sqrt_N_2;
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("k_idx %d, k_idx_offset %d\n", k_idx, k_idx_offset);
+        // }
+        // first DFT, output is NOT written to shared memory
+        complex_matmul_load_b<wmma::col_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, false, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the output
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the output
+            sqrt_N_2,
+            N,
+            a_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_1024_dft_frag[k_idx],
+            wmma::mem_row_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After first DFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 32; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat162float(a_real[a_idx]), __nv_bfloat162float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+        // second DFT, output is NOT written to a_real, a_imag
+        complex_matmul<wmma::row_major, wmma::row_major, false, false, MATMUL_WARP_WIDTH_2, true, false>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_dft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_dft_frag,
+            wmma::mem_row_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After second DFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 8; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat162float(a_real[a_idx]), __nv_bfloat162float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+        // load the input from acc_frag_1, and multiply by k_frag
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH_2, true, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_idft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            k_frag[k_idx],
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After iDFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 8; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat162float(a_real[a_idx]), __nv_bfloat162float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+        complex_matmul<wmma::row_major, wmma::row_major, false, true, MATMUL_WARP_WIDTH_2, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset),
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset),
+            // reinterpret_cast<__nv_bfloat16 *>(out + input_offset + k_idx_offset),
+            sqrt_N_2,
+            N,
+            b_frag_idft_N_2,
+            acc_frag_2,
+            acc_frag_2_half,
+            twiddle_32_idft_frag,
+            wmma::mem_col_major);
+        // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k_idx < 2) {
+        //   printf("After 2nd iDFT in the conv, %d\n", k_idx);
+        //   for (int i = 0; i < 8; i++) {
+        //     a_idx = i * num_threads + thread_id + k_idx_offset;
+        //     printf("%f + %fi, ", __nv_bfloat162float(a_real[a_idx]), __nv_bfloat162float(a_imag[a_idx]));
+        //   }
+        //   printf("\n");
+        // }
+        // __syncthreads();
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //   printf("After inner conv\n");
+      //   for (int i = 0; i < items_per_thread_input; i++) {
+      //     a_idx = i * num_threads + thread_id;
+      //     printf("%f + %fi, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])), __bfloat162float(__nv_bfloat16(a_imag[a_idx])));
+      //   }
+      //   printf("\n");
+      // }
+      // 1024 / 16 = 64
+      for (int k_idx = 0; k_idx < 64 / WARP_TILE_SIZE; k_idx++)
+      {
+        // k_idx_offset = k_idx * DFT_SIZE + warp_id * (16 / WARP_TILE_SIZE) * DFT_SIZE;
+        k_idx_offset = k_idx * WARP_TILE_SIZE * sqrt_N_1 + warp_id * sqrt_N_1;
+        // outer DFT
+        complex_matmul_c2r_1024<wmma::col_major, wmma::row_major, true, true, MATMUL_WARP_WIDTH_1, false, true>(
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_imag + k_idx_offset), // this is the input
+            reinterpret_cast<__nv_bfloat16 *>(a_real + k_idx_offset), // write to SRAM
+            sqrt_N_1,
+            N,
+            b_frag_idft_N_1,
+            acc_frag_1,
+            acc_frag_1_half,
+            twiddle_1024_idft_frag[k_idx],
+            wmma::mem_col_major);
+      }
+      __syncthreads();
+      // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+      //    printf("Before output\n");
+      //    for (int i = 0; i < items_per_thread_input; i++) {
+      //       a_idx = i * num_threads + thread_id;
+      //       printf("%f, ", __bfloat162float(__nv_bfloat16(a_real[a_idx])));
+      //    }
+      //    printf("\n");
+      // }
+      #pragma unroll
+      for (int i = 0; i < items_per_thread_input / 2; i++)
+      {
+        a_idx = i * num_threads + thread_id;
+        if(out_gate != nullptr){
+          reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i] = __hmul2(
+            reinterpret_cast<__nv_bfloat162 *>(gate_data)[i],
+            reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx]
+          );
+        }else{
+          reinterpret_cast<__nv_bfloat162 *>(x_input_data)[i] = reinterpret_cast<__nv_bfloat162 *>(a_real)[a_idx];
+        }
+      }
+      // HACK
+      // for now, just output the a_real output
+      BlockStore_Sequence().Store(
+          reinterpret_cast<float *>(out + input_offset),
+          reinterpret_cast<float(&)[items_per_thread_input / 2]>(x_input_data),
+          signal_size / 2
+      );
+      __syncthreads();
+    } // b_tile_id
+  }   // h_tile_id
+}