Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

App Files Files Community

icarus112 commited on 6 days ago

Commit

c475135

verified ·

1 Parent(s): a0ab607

Update Feather a10g-large training runtime image

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +16 -0
.guardian_trigger_20260512_211050 +1 -0
.rebuild_sentry +1 -0
FORCE_REBUILD +3 -0
README.md +6 -5
REBUILD_FLAG_1778645488 +0 -0
entrypoint.py +1 -20
overlay/.dockerignore +20 -0
overlay/BUILD_STAMP +1 -0
overlay/harness/benchmark_validity.py +210 -0
overlay/harness/tps_manifest_validity.py +209 -0
overlay/htm_rust/.cargo/config.toml +2 -0
overlay/htm_rust/.claude/CLAUDE.md +0 -0
overlay/htm_rust/.letta/claude/conversations.json +6 -0
overlay/htm_rust/.letta/claude/session-c892b9c9-7fe5-4f14-8157-ec8740e965d1.json +0 -0
overlay/htm_rust/Cargo.lock +42 -0
overlay/htm_rust/Cargo.toml +3 -1
overlay/htm_rust/DLB_PERKS_IMPLEMENTATION_PLAN.md +194 -0
overlay/htm_rust/bench_gpu.py +81 -0
overlay/htm_rust/docs/GPU_HTM.md +302 -0
overlay/htm_rust/src/gpu/fused.rs +58 -10
overlay/htm_rust/src/gpu/mod.rs +134 -1
overlay/htm_rust/src/lib.rs +27 -0
overlay/htm_rust/src/region.rs +2 -0
overlay/htm_rust/src/sp.rs +5 -1
overlay/htm_rust/src/tm.rs +6 -2
overlay/htm_rust/uv.lock +8 -0
overlay/hydra/model.py +96 -8
overlay/hydra/optimizer.py +118 -44
overlay/hydra/training.py +66 -25
overlay/kernels/__init__.py +0 -0
overlay/kernels/cuda/decode_kernels.cu +10 -0
overlay/kernels/cuda/flashfftconv/LICENSE +201 -0
overlay/kernels/cuda/flashfftconv/README.md +57 -0
overlay/kernels/cuda/flashfftconv/UPSTREAM_COMMIT +1 -0
overlay/kernels/cuda/flashfftconv/csrc/.gitignore +10 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly.h +374 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda.cu +699 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda_bf16.cu +725 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda.cu +723 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda_bf16.cu +705 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_cuda.cu +871 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_cuda_bf16.cu +897 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_ifft_cuda.cu +905 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_ifft_cuda_bf16.cu +917 -0
overlay/kernels/cuda/flashfftconv/csrc/butterfly/shared.h +60 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d.h +96 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bhl.cu +132 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_blh.cu +202 -0
overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bwd_cuda_bhl.cu +106 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,16 @@

+# Keep HF runtime image context deterministic and small.
+**/__pycache__/
+**/*.py[cod]
+**/.pytest_cache/
+**/.mypy_cache/
+**/.ruff_cache/
+**/.venv/
+**/target/
+**/logs/
+**/*.log
+**/*.out
+**/*.pt
+**/*.safetensors
+**/*.parquet
+**/*.npz
+**/.git/

.guardian_trigger_20260512_211050 ADDED Viewed

	@@ -0,0 +1 @@


1	+ Guardian forced rebuild at 2026-05-12T21:10:50.366196

.rebuild_sentry ADDED Viewed

	@@ -0,0 +1 @@


1	+ FORCE_REBUILD_e9883655-cf86-4724-84bd-68740a3feefb

FORCE_REBUILD ADDED Viewed

	@@ -0,0 +1,3 @@

+FORCE_SPACE_REBUILD=$(date -u +%s)
+# This flag forces the Space image to rebuild with the latest overlay code
+# containing the retina_contrastive fix

README.md CHANGED Viewed

@@ -1,10 +1,11 @@
 ---
-title: Feather A10g Large Runtime
-emoji: 🌍
-colorFrom: pink
-colorTo: pink
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Feather H200 Runtime Slim
+emoji: 📚
+colorFrom: blue
+colorTo: indigo
 sdk: docker
+app_port: 7860
 pinned: false
 ---
+Feather runtime image used as a Docker Space source for Hugging Face Jobs.

REBUILD_FLAG_1778645488 ADDED Viewed

File without changes

entrypoint.py CHANGED Viewed

@@ -217,25 +217,6 @@ def _run_training_subprocess(cmd: list[str]) -> int:
 def run_job_mode() -> int:
     os.chdir(REPO_ROOT)
-    # Guardian: force contrastive_rank=0 and disk-patch sdr_semantic.py
-    os.environ["HYDRA_CONTRASTIVE_RANK"] = "0"
-    _sdr_path = REPO_ROOT / 'subsystems' / 'sdr_semantic.py'
-    if _sdr_path.exists():
-        _text = _sdr_path.read_text()
-        if 'retina_contrastive' not in _text:
-            print('[guardian] patching sdr_semantic.py on disk ...', flush=True)
-            _text = _text.replace(
-                'super().__init__()\n' +
-                '        # Audit 2026-05-13: allow disabling',
-                'super().__init__()\n' +
-                '        self.retina_contrastive = None  # guardian patch\n' +
-                '        # Audit 2026-05-13: allow disabling',
-            )
-            _sdr_path.write_text(_text)
-            print('[guardian] patched sdr_semantic.py on disk', flush=True)
-        print('[guardian] HYDRA_CONTRASTIVE_RANK=0 enforced for checkpoint compat', flush=True)
     # Dynamic live patch from GitHub to bypass Space build errors
     GIT_REF = os.environ.get('FEATHER_GIT_REF')
     if GIT_REF:
@@ -307,4 +288,4 @@ def main() -> int:
 if __name__ == '__main__':
-    raise SystemExit(main())

 def run_job_mode() -> int:
     os.chdir(REPO_ROOT)
     # Dynamic live patch from GitHub to bypass Space build errors
     GIT_REF = os.environ.get('FEATHER_GIT_REF')
     if GIT_REF:
 if __name__ == '__main__':
+    raise SystemExit(main())

overlay/.dockerignore ADDED Viewed

	@@ -0,0 +1,20 @@

+.git
+.github
+.venv
+.remember
+.letta
+.claude
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.log
+run_*.log
+run*.log
+*.txt
+WORKER_COMPLETE
+autoresearch_loop.log
+data/
+state_store/
+htm_rust/target/
+hydra-core/target/

overlay/BUILD_STAMP ADDED Viewed

	@@ -0,0 +1 @@


1	+ 1778646814_120314

overlay/harness/benchmark_validity.py ADDED Viewed

	@@ -0,0 +1,210 @@

+"""Benchmark validity and comparable-group helpers for HYDRA scorecards.
+This module deliberately separates benchmark validity from model quality. A run
+can be useful diagnostic evidence while still being invalid for promotion if its
+corpus or eval protocol differs from the baseline.
+"""
+from __future__ import annotations
+import hashlib
+import json
+from copy import deepcopy
+from typing import Any
+PUBLIC_FULL_BLEND_ID = "public_full_blend_v0"
+PUBLIC_FULL_BLEND_WEIGHTS = {
+    "fineweb-edu": 0.55,
+    "wikipedia": 0.25,
+    "cosmopedia": 0.15,
+    "fineweb": 0.05,
+}
+GATED_OR_PRIVATE_MARKERS = (
+    "stack-v2",
+    "nemotron-math",
+    "nemotron-specialized",
+    "nvidia/nemotron",
+    "Nemotron-CC-Math",
+    "Nemotron-Pretraining-Specialized",
+)
+def _text_blob(row: dict[str, Any]) -> str:
+    return json.dumps(row, sort_keys=True, default=str)
+def _ablation(row: dict[str, Any]) -> dict[str, Any]:
+    ablation = row.get("ablation")
+    return ablation if isinstance(ablation, dict) else {}
+def _has_public_full_blend(row: dict[str, Any]) -> bool:
+    ablation = _ablation(row)
+    corpus_profile = str(row.get("corpus_profile") or "").lower()
+    corpus_standard = str(ablation.get("corpus_standard") or row.get("corpus_standard") or "").lower()
+    notes = str(row.get("notes") or "").lower()
+    blend_weights = row.get("full_blend_weights")
+    single_config = str(
+        ablation.get("HYDRA_NEMOTRON_SINGLE_CONFIG")
+        or row.get("HYDRA_NEMOTRON_SINGLE_CONFIG")
+        or ""
+    ).strip().lower()
+    has_full_blend_marker = (
+        row.get("HYDRA_USE_FULL_BLEND") == "1"
+        or row.get("HYDRA_USE_FULL_BLEND") == 1
+        or row.get("HYDRA_USE_FULL_BLEND") is True
+        or "hydra_use_full_blend=1" in corpus_standard
+        or corpus_profile == PUBLIC_FULL_BLEND_ID
+        or blend_weights == PUBLIC_FULL_BLEND_WEIGHTS
+        or "public benchmark blend" in corpus_standard
+        or "public full-blend" in notes
+        or "full-blend eval settings" in notes
+    )
+    single_config_is_blank = single_config in {"", "<unset>", "none", "null"}
+    return bool(has_full_blend_marker and single_config_is_blank)
+def _uses_private_or_gated_corpus(row: dict[str, Any]) -> bool:
+    blob = _text_blob(row).lower()
+    return any(marker.lower() in blob for marker in GATED_OR_PRIVATE_MARKERS)
+def _eval_tokens(row: dict[str, Any]) -> int | None:
+    raw = row.get("eval_tokens")
+    if raw in (None, ""):
+        return None
+    try:
+        return int(raw)
+    except (TypeError, ValueError):
+        return None
+def _eval_batch(row: dict[str, Any]) -> int | None:
+    raw = row.get("eval_batch", 1)
+    if raw in (None, ""):
+        return None
+    try:
+        return int(raw)
+    except (TypeError, ValueError):
+        return None
+def _eval_protocol(row: dict[str, Any]) -> str:
+    val_source = str(row.get("val_source") or "").lower()
+    row_type = str(row.get("type") or "").lower()
+    if "fresh_checkpoint_eval" in val_source or "fresh_checkpoint_eval" in row_type:
+        return "fresh_checkpoint_eval"
+    if "in_process" in val_source or "in_process" in row_type:
+        return "in_process_eval"
+    return val_source or row_type or "unknown_eval"
+def _gpu_flavor(row: dict[str, Any]) -> str:
+    return str(row.get("gpu_flavor") or row.get("FEATHER_HF_FLAVOR") or "a10g-large").lower()
+def _runtime_profile(row: dict[str, Any]) -> str:
+    return str(
+        row.get("runtime_profile")
+        or row.get("FEATHER_HF_RUNTIME_PROFILE")
+        or "a10-compromise-telemetry"
+    ).lower()
+def benchmark_invalid_reason(row: dict[str, Any]) -> str:
+    """Return an empty string when a row is benchmark-valid."""
+    if row.get("crashed") is True:
+        return "run crashed"
+    if row.get("metrics_write_failed") is True and row.get("val_bpb") in (None, 0, 0.0):
+        return "metrics missing or failed"
+    val_bpb = row.get("val_bpb")
+    try:
+        if val_bpb is None or float(val_bpb) <= 0:
+            return "missing positive val_bpb"
+    except (TypeError, ValueError):
+        return "missing positive val_bpb"
+    if not _has_public_full_blend(row):
+        return "not public full blend / full blend invariant missing"
+    if _uses_private_or_gated_corpus(row):
+        return "uses private/gated corpus marker"
+    if _eval_tokens(row) is None:
+        return "missing eval_tokens"
+    if _eval_batch(row) is None:
+        return "missing eval_batch"
+    if _eval_protocol(row) != "fresh_checkpoint_eval":
+        return "not fresh checkpoint eval"
+    return ""
+def comparable_group_id(row: dict[str, Any]) -> str:
+    """Build a stable comparable-group identifier from protocol fields only.
+    Deliberately excludes checkpoint/model/ablation identities so architecture
+    variants can be compared when corpus and eval protocol match.
+    """
+    parts = {
+        "corpus": PUBLIC_FULL_BLEND_ID if _has_public_full_blend(row) else "non_public_or_unknown_corpus",
+        "eval_protocol": _eval_protocol(row),
+        "eval_tokens": _eval_tokens(row),
+        "eval_batch": _eval_batch(row),
+        "gpu_flavor": _gpu_flavor(row),
+        "runtime_profile": _runtime_profile(row),
+    }
+    digest = hashlib.sha1(json.dumps(parts, sort_keys=True).encode()).hexdigest()[:10]
+    return "cmp_" + digest
+def normalize_scorecard_row(row: dict[str, Any]) -> dict[str, Any]:
+    """Return a row copy annotated with v0 benchmark validity metadata."""
+    normalized = deepcopy(row)
+    invalid_reason = benchmark_invalid_reason(normalized)
+    normalized["benchmark_valid"] = not invalid_reason
+    normalized["benchmark_status"] = "comparable" if not invalid_reason else "diagnostic"
+    normalized["invalid_reason"] = invalid_reason
+    normalized["corpus_profile"] = PUBLIC_FULL_BLEND_ID if _has_public_full_blend(normalized) else "non_public_or_unknown"
+    normalized["full_blend_weights"] = PUBLIC_FULL_BLEND_WEIGHTS if _has_public_full_blend(normalized) else None
+    normalized["eval_tokens"] = _eval_tokens(normalized)
+    normalized["eval_batch"] = _eval_batch(normalized)
+    normalized["eval_protocol"] = _eval_protocol(normalized)
+    normalized["gpu_flavor"] = _gpu_flavor(normalized)
+    normalized["runtime_profile"] = _runtime_profile(normalized)
+    normalized["comparable_group_id"] = comparable_group_id(normalized)
+    return normalized
+def are_comparable(left: dict[str, Any], right: dict[str, Any]) -> bool:
+    left_n = normalize_scorecard_row(left)
+    right_n = normalize_scorecard_row(right)
+    return bool(
+        left_n["benchmark_valid"]
+        and right_n["benchmark_valid"]
+        and left_n["comparable_group_id"] == right_n["comparable_group_id"]
+    )
+def compare_candidate(candidate: dict[str, Any], baseline: dict[str, Any]) -> dict[str, Any]:
+    """Compare two scorecard rows with validity-first promotion semantics."""
+    candidate_n = normalize_scorecard_row(candidate)
+    baseline_n = normalize_scorecard_row(baseline)
+    if not candidate_n["benchmark_valid"]:
+        return {"decision": "invalid_candidate", "reason": candidate_n["invalid_reason"]}
+    if not baseline_n["benchmark_valid"]:
+        return {"decision": "invalid_baseline", "reason": baseline_n["invalid_reason"]}
+    if candidate_n["comparable_group_id"] != baseline_n["comparable_group_id"]:
+        return {
+            "decision": "not_comparable",
+            "reason": (
+                "comparable_group_id mismatch: "
+                f"candidate={candidate_n['comparable_group_id']} "
+                f"baseline={baseline_n['comparable_group_id']}"
+            ),
+        }
+    delta_bpb = float(candidate_n["val_bpb"]) - float(baseline_n["val_bpb"])
+    if delta_bpb < 0:
+        decision = "promote_candidate"
+    elif delta_bpb > 0:
+        decision = "keep_baseline"
+    else:
+        decision = "tie_requires_replication"
+    return {"decision": decision, "delta_bpb": delta_bpb, "reason": "same comparable_group_id"}

overlay/harness/tps_manifest_validity.py ADDED Viewed

	@@ -0,0 +1,209 @@

+"""TPS/profiling manifest validity helpers for Feather kernel-fusion sweeps.
+This module is the TPS-side sibling of ``harness.benchmark_validity``. It does
+not decide model quality; it decides whether a row is valid evidence for max-TPS
+promotion versus attribution/diagnostic evidence. The rules are intentionally
+conservative because profiling flags and CPU fallbacks can make fast-looking rows
+incomparable or unfaithful.
+"""
+from __future__ import annotations
+from copy import deepcopy
+from typing import Any
+A10_FLAVORS = {"a10g-small", "a10g-large", "a10g-largex2", "a10g-largex4"}
+PROFILE_TRUE = {"1", "true", "yes", "on"}
+PROFILE_FALSE = {"0", "false", "no", "off", ""}
+def _as_bool(value: Any, *, default: bool = False) -> bool:
+    if isinstance(value, bool):
+        return value
+    if value is None:
+        return default
+    text = str(value).strip().lower()
+    if text in PROFILE_TRUE:
+        return True
+    if text in PROFILE_FALSE:
+        return False
+    return default
+def _int_or_none(value: Any) -> int | None:
+    if value in (None, ""):
+        return None
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return None
+def _float_or_none(value: Any) -> float | None:
+    if value in (None, ""):
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+def _nested(row: dict[str, Any], key: str) -> dict[str, Any]:
+    value = row.get(key)
+    return value if isinstance(value, dict) else {}
+def _env(row: dict[str, Any]) -> dict[str, Any]:
+    return _nested(row, "env")
+def _receipts(row: dict[str, Any]) -> dict[str, Any]:
+    return _nested(row, "receipts") or _nested(row, "receipts_required")
+def _hardware(row: dict[str, Any]) -> dict[str, Any]:
+    return _nested(row, "hardware")
+def _profile_forward_enabled(row: dict[str, Any]) -> bool:
+    env = _env(row)
+    receipts = _receipts(row)
+    if "profile_forward" in receipts:
+        return _as_bool(receipts.get("profile_forward"))
+    return _as_bool(env.get("HYDRA_PROFILE_FORWARD"))
+def _tps_window(row: dict[str, Any]) -> dict[str, Any]:
+    receipts = _receipts(row)
+    window = receipts.get("training_tps_window") or row.get("training_tps_window") or row.get("tps_window")
+    return window if isinstance(window, dict) else {}
+def _median_tps(row: dict[str, Any]) -> float | None:
+    window = _tps_window(row)
+    return _float_or_none(window.get("median") or row.get("median_tps") or row.get("tps"))
+def _flavor(row: dict[str, Any]) -> str:
+    hardware = _hardware(row)
+    receipts = _receipts(row)
+    return str(
+        hardware.get("flavor")
+        or receipts.get("flavor_verified")
+        or row.get("gpu_flavor")
+        or row.get("FEATHER_HF_FLAVOR")
+        or ""
+    ).strip().lower()
+def _duplicate_count(row: dict[str, Any]) -> int | None:
+    check = row.get("duplicate_active_job_check")
+    if not isinstance(check, dict):
+        return None
+    return _int_or_none(check.get("active_matching_jobs"))
+def _scale_free_a10g_invalid_reasons(row: dict[str, Any]) -> list[str]:
+    """Return fail-closed reasons for bounded A10G scale-free HTM proof rows."""
+    env = _env(row)
+    reasons: list[str] = []
+    if _flavor(row) not in A10_FLAVORS:
+        return reasons
+    proof_requested = (
+        _as_bool(env.get("HYDRA_HTM_STRICT_SCALE_FREE"), default=False)
+        or str(row.get("runtime_profile") or "").strip().lower() in {"optimal-strict", "a10g-scale-free-proof"}
+    )
+    if not proof_requested:
+        return reasons
+    if env.get("HYDRA_TARGET_SHARDS") not in {"0", 0}:
+        reasons.append("scale-free A10G proof requires HYDRA_TARGET_SHARDS=0")
+    if env.get("HYDRA_HTM_STRICT_SCALE_FREE") != "1":
+        reasons.append("scale-free A10G proof requires HYDRA_HTM_STRICT_SCALE_FREE=1")
+    region_pool = _int_or_none(env.get("HYDRA_HTM_REGION_POOL_SIZE"))
+    chunk_b = _int_or_none(env.get("HYDRA_HTM_CHUNK_B"))
+    if region_pool is None:
+        reasons.append("scale-free A10G proof requires HYDRA_HTM_REGION_POOL_SIZE")
+    elif region_pool > 4:
+        reasons.append("scale-free A10G proof requires HYDRA_HTM_REGION_POOL_SIZE<=4")
+    if chunk_b is None:
+        reasons.append("scale-free A10G proof requires HYDRA_HTM_CHUNK_B")
+    elif region_pool is not None and chunk_b > region_pool:
+        reasons.append("scale-free A10G proof requires HYDRA_HTM_CHUNK_B<=HYDRA_HTM_REGION_POOL_SIZE")
+    if env.get("HYDRA_TOKEN_CACHE_GB") not in {"0", 0}:
+        reasons.append("scale-free A10G proof requires HYDRA_TOKEN_CACHE_GB=0")
+    if env.get("HYDRA_DISABLE_TOKEN_CACHE") != "1":
+        reasons.append("scale-free A10G proof requires HYDRA_DISABLE_TOKEN_CACHE=1")
+    for key in (
+        "HYDRA_HTM_REGION_POOL_SIZE_FROM_VRAM",
+        "HYDRA_HTM_SCALE_TO_VRAM",
+        "HYDRA_VRAM_TOPOLOGY_SCALE",
+        "FEATHER_VRAM_TOPOLOGY_SCALE",
+    ):
+        if _as_bool(env.get(key), default=False):
+            reasons.append(f"scale-free A10G proof forbids VRAM-derived topology scaling: {key}")
+    return reasons
+def tps_manifest_invalid_reasons(row: dict[str, Any]) -> list[str]:
+    """Return all reasons a row cannot be used as max-TPS promotion evidence."""
+    reasons: list[str] = []
+    env = _env(row)
+    receipts = _receipts(row)
+    flavor = _flavor(row)
+    if row.get("crashed") is True:
+        reasons.append("run crashed")
+    if flavor not in A10_FLAVORS:
+        reasons.append(f"not A10G flavor: {flavor or 'missing'}")
+    if _profile_forward_enabled(row):
+        reasons.append("profile_forward enabled; attribution-only overhead row")
+    if _median_tps(row) is None:
+        reasons.append("missing training TPS window median")
+    duplicate_count = _duplicate_count(row)
+    if duplicate_count is None:
+        reasons.append("duplicate active job check missing")
+    elif duplicate_count > 0:
+        reasons.append(f"duplicate active Feather A10G jobs present: {duplicate_count}")
+    faithful_profile = "faithful" in str(row.get("runtime_profile") or "").lower()
+    htm_gpu_verified = _as_bool(receipts.get("htm_gpu_verified"), default=False)
+    force_htm_cpu = _as_bool(env.get("HYDRA_FORCE_HTM_CPU"), default=False)
+    if faithful_profile and (force_htm_cpu or not htm_gpu_verified):
+        reasons.append("faithful row lacks HTM GPU verification or uses CPU fallback")
+    if faithful_profile and env.get("HYDRA_HTM_FUSED") != "1":
+        reasons.append("faithful row missing HYDRA_HTM_FUSED=1")
+    if faithful_profile and env.get("HYDRA_HTM_BATCHED_FUSED") != "1":
+        reasons.append("faithful row missing HYDRA_HTM_BATCHED_FUSED=1")
+    if _as_bool(env.get("HYDRA_USE_NEMOTRON"), default=False) and env.get("HYDRA_TARGET_SHARDS") not in {"0", 0}:
+        reasons.append("Nemotron streaming TPS row must use HYDRA_TARGET_SHARDS=0")
+    if env.get("HYDRA_TOKEN_CACHE_GB") not in {"0", 0, None}:
+        reasons.append("token cache enabled/materializing during TPS row")
+    reasons.extend(_scale_free_a10g_invalid_reasons(row))
+    return reasons
+def tps_manifest_invalid_reason(row: dict[str, Any]) -> str:
+    return "; ".join(tps_manifest_invalid_reasons(row))
+def normalize_tps_manifest(row: dict[str, Any]) -> dict[str, Any]:
+    """Return a copy annotated with TPS/profiling validity metadata."""
+    normalized = deepcopy(row)
+    reasons = tps_manifest_invalid_reasons(normalized)
+    profile_forward = _profile_forward_enabled(normalized)
+    normalized["tps_valid"] = not reasons
+    if not reasons:
+        status = "promotion_candidate"
+    elif profile_forward or str(normalized.get("metric_role") or "").lower() == "profile":
+        status = "attribution_only"
+    else:
+        status = "diagnostic"
+    normalized["tps_status"] = status
+    normalized["invalid_reason"] = "; ".join(reasons)
+    normalized["gpu_flavor"] = _flavor(normalized)
+    normalized["median_tps"] = _median_tps(normalized)
+    normalized["profile_forward"] = profile_forward
+    normalized["duplicate_active_job_count"] = _duplicate_count(normalized)
+    return normalized

overlay/htm_rust/.cargo/config.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [target.x86_64-unknown-linux-gnu]
2	+ linker = "/usr/bin/cc"

overlay/htm_rust/.claude/CLAUDE.md ADDED Viewed

The diff for this file is too large to render. See raw diff

overlay/htm_rust/.letta/claude/conversations.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "c892b9c9-7fe5-4f14-8157-ec8740e965d1": {
+    "conversationId": "conv-b42ddc79-3745-4edf-b165-4281a8961d3b",
+    "agentId": "agent-2cc00bdf-45f5-4725-bb56-7b4ab142153e"
+  }
+}

overlay/htm_rust/.letta/claude/session-c892b9c9-7fe5-4f14-8157-ec8740e965d1.json ADDED Viewed

The diff for this file is too large to render. See raw diff

overlay/htm_rust/Cargo.lock CHANGED Viewed

@@ -8,6 +8,15 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
@@ -44,12 +53,14 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 name = "htm_rust"
 version = "0.1.0"
 dependencies = [
  "cudarc",
  "ndarray",
  "numpy",
  "pyo3",
  "rand",
  "rand_xoshiro",
 ]
 [[package]]
@@ -301,6 +312,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
 dependencies = [
  "rand_core",
 ]
 [[package]]
@@ -321,6 +333,36 @@ version = "1.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
 [[package]]
 name = "syn"
 version = "2.0.117"

 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+[[package]]
+name = "bincode"
+version = "1.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+dependencies = [
+ "serde",
+]
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
 name = "htm_rust"
 version = "0.1.0"
 dependencies = [
+ "bincode",
  "cudarc",
  "ndarray",
  "numpy",
  "pyo3",
  "rand",
  "rand_xoshiro",
+ "serde",
 ]
 [[package]]
 checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
 dependencies = [
  "rand_core",
+ "serde",
 ]
 [[package]]
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
 [[package]]
 name = "syn"
 version = "2.0.117"

overlay/htm_rust/Cargo.toml CHANGED Viewed

@@ -15,7 +15,9 @@ pyo3 = { version = "0.22", features = ["extension-module"] }
 numpy = "0.22"
 ndarray = "0.16"
 rand = "0.8"
-rand_xoshiro = "0.6"
 # cudarc: CUDA Rust bindings with dynamic-loading (no link-time dep on libcuda).
 # Kernels are embedded as PTX and JIT-compiled at runtime.
 cudarc = { version = "0.12", default-features = false, features = ["dynamic-linking", "driver", "cuda-12010"], optional = true }

 numpy = "0.22"
 ndarray = "0.16"
 rand = "0.8"
+rand_xoshiro = { version = "0.6", features = ["serde1"] }
+serde = { version = "1", features = ["derive"] }
+bincode = "1.3"
 # cudarc: CUDA Rust bindings with dynamic-loading (no link-time dep on libcuda).
 # Kernels are embedded as PTX and JIT-compiled at runtime.
 cudarc = { version = "0.12", default-features = false, features = ["dynamic-linking", "driver", "cuda-12010"], optional = true }

overlay/htm_rust/DLB_PERKS_IMPLEMENTATION_PLAN.md ADDED Viewed

	@@ -0,0 +1,194 @@

+# HTM-on-H200 Performance Plan: Persistent Kernel + Hopper Cluster mbarrier
+**Goal:** Drive HTM forward from 400ms → ~40-80ms (5-10×) → tps 38k → 200-400k
+**Hardware:** NVIDIA H200, 132 SMs, sm_90a, CUDA 12.4+
+---
+## The Real Bottleneck (established)
+```
+Current batched cooperative kernel (grid=(16,8,1)=128 blocks):
+  htm_launch = 400-440 ms  ← hard wall
+  tps        = 35-38 k
+```
+**Why we can't beat it with cooperative launch:**
+- Cooperative kernels serialize at the device level (1 cooperative kernel at a time).
+- H200 grid cap = 132 blocks (1 block/SM at block=1024). For B=8 regions batched: 16 blocks/region ceiling.
+- Work × grid = constant: reshuffling blocks doesn't help.
+**Why software DLB barrier made it worse (measured 650ms, 23k tps):**
+- 128 blocks × 3 barriers/timestep × 2048 timesteps × ~5-10µs coordinator poll = ~300ms pure overhead.
+- L2-contention tax (documented 20× slowdown on H200 vs 3060 for software atomic spin).
+**The two paths that actually scale on H200 (per research):**
+| Path | Pattern | Expected |
+|------|---------|----------|
+| **A** | PERKS-style persistent kernel + in-kernel turnstile | 1.3–1.8× = ~280-330 ms |
+| **B** | Hopper Cluster mbarrier (hardware sync + TMA multicast) | 5–10× = ~40-80 ms |
+Path B wins. It uses *hardware* primitives that match cooperative launch's speed while not being subject to the device-level serialization.
+---
+## Architecture: Cluster-Mapped HTM (Design 2 from research)
+**Mapping:** Each of our 8 HTM regions → one Hopper Thread Block Cluster of 16 SMs
+- Cluster size: 16 blocks (= current per-region grid_x)
+- Total: 8 clusters × 16 SMs = 128 SMs used, 4 SMs spare
+- Grid launch: `grid = (16, 8, 1)`, `cluster = (16, 1, 1)` — batched identically to today but with `CUDA_CLUSTER` launch attribute
+**Per-cluster sync primitives (replace grid.sync()):**
+1. **Intra-cluster barrier:** `cluster::sync()` — hardware-level, ~10-40 ns (vs software atomic ~100-500 ns)
+2. **Cluster-distributed shared memory:** each SM in cluster can directly `cuda::memcpy_async` from another SM's smem
+3. **TMA multicast (`cp.async.bulk.tensor ... multicast`):** one TMA descriptor propagates input SDRs / column activations to all 16 SMs in cluster in a single DMA
+**Between clusters (8 regions):** independent — each region updates its own state and its own cluster's mbarriers. Multiple clusters run concurrently at hardware-scheduler level, bounded only by SM count (fits because 8 × 16 = 128 ≤ 132).
+**Inside the kernel body:** T=2048 timesteps run in a persistent loop. Hot state (boost, active_duty, inhibition_threshold, cell_active/winner bitsets) stays in registers / cluster-shared smem across timesteps — no per-timestep DRAM round-trip.
+---
+## Task Plan (Detailed, Dependency-Ordered)
+### Phase 1 — Feasibility & Setup (no GPU risk)
+**T1. Cluster launch feasibility probe**
+- Query `cuDeviceGetAttribute` for `CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR` and `CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH`
+- Verify H200 supports cluster launch with `cluster_size=16`
+- Source: `cudarc::driver::result::launch_kernel_ex` with `CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`
+- Files: `htm_rust/src/gpu/fused.rs` — add probe at FusedState::new
+**T2. Enable sm_90a PTX compilation + `--device-c` for rdc link**
+- Current build.rs targets `sm_90`. Need `sm_90a` to access cluster intrinsics
+- Add `-arch=sm_90a -rdc=true` to nvcc invocation
+- Files: `htm_rust/build.rs`
+**T3. Update cudarc version to 0.12 minimum**
+- Current 0.12. Verify `result::launch_kernel_ex` and `CUkernelNodeAttrValue` are available
+- If not, upgrade to latest 0.13+
+- Files: `htm_rust/Cargo.toml`
+### Phase 2 — Cluster mbarrier primitive (isolated, testable)
+**T4. Rewrite `fused_grid_barrier` as cluster barrier**
+- Replace my DLB software barrier + `cg::grid_group::sync()` with:
+  ```cpp
+  namespace cg = cooperative_groups;
+  auto cluster = cg::this_cluster();   // sm_90a intrinsic
+  cluster.sync();                       // hardware barrier
+  ```
+- No more `flags[]` array, no spin-wait, no `__nanosleep`
+- Files: `htm_rust/src/gpu/kernels/htm_fused_step.cu:117-160`
+- Reference: CUTLASS `include/cutlass/pipeline/sm90_pipeline.hpp`
+**T5. Delete `barrier_counters` allocation + plumbing**
+- No longer needed with cluster barrier
+- Files: `htm_rust/src/gpu/fused.rs` — remove `barrier_counters` field, FusedPtrs field, alloc
+**T6. Unit test cluster sync on minimal kernel**
+- Write a standalone test kernel that just does: load input, cluster::sync(), write output
+- Launch with `cluster_dim=(16,1,1)`, `grid=(16,1,1)`, `block=(1024,1,1)`
+- Verify no deadlock, correct values
+- Files: `htm_rust/src/gpu/tests.rs`
+### Phase 3 — Persistent in-kernel timestep loop
+**T7. Move T=2048 loop inside kernel body**
+- Currently the T loop is inside the kernel already (`for (t = 0; t < cfg.T; t++)` at line 176)
+- Persistent pattern means the SAME kernel processes all 2048 steps without relaunch
+- Already the case! Just verify with cluster barrier replacing grid.sync
+**T8. Cache hot state in cluster-distributed shared memory**
+- Move `inhibition_threshold[n_columns]` from GMEM to cluster smem (16 SMs × 48KB = 768KB available per cluster)
+- With n_columns=2048 and f32 = 8KB per cluster — trivially fits
+- Similarly cache `boost[n_columns]` (8KB) and `active_duty[n_columns]` (8KB)
+- Each SM in cluster holds a slice; reads from peer SM via `cuda::memcpy_async` with cluster scope
+- Files: kernel `htm_fused_step_body`
+- Reference: CUTLASS cluster shmem examples in `examples/49_hopper_gemm_with_collective_builder`
+**T9. TMA multicast for per-timestep input broadcast**
+- Each timestep broadcasts the current SDR input + prev column-activation state to all 16 SMs in cluster
+- Use `cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster`
+- Single DMA instead of 16 blocks each reading from GMEM
+- Files: kernel, plus set up `CUtensorMap` descriptors in Rust host
+- Reference: [CUDA TMA multicast docs](https://docs.nvidia.com/cuda/hopper-tuning-guide/index.html)
+### Phase 4 — Rust host update
+**T10. Switch launch to `launch_kernel_ex` with cluster attribute**
+- Current: `result::launch_kernel(func, grid, block, shmem, stream, params)`
+- New: `launch_kernel_ex(func, grid, cluster, block, shmem, stream, params, attrs)`
+- Cluster attribute: `CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` = `(16, 1, 1)`
+- Files: `htm_rust/src/gpu/fused.rs` — both `launch_fused` and `launch_fused_batched_raw`
+**T11. Allocate cluster-scope CUtensorMap descriptors**
+- One per region for input SDR, cols_out, anom_out
+- Rust side: `cuTensorMapEncodeTiled` with appropriate swizzling
+- Files: `htm_rust/src/gpu/fused.rs` — FusedState::new extended with tensor maps
+**T12. Bump MAX_REGISTERS / occupancy**
+- With cluster + persistent kernel, register budget per thread tightens
+- May need `__launch_bounds__(1024, 2)` to force 2 blocks/SM
+- Verify occupancy with `cudaOccupancyMaxActiveBlocksPerMultiprocessor`
+- Files: kernel, fused.rs
+### Phase 5 — Validation + measurement
+**T13. Parity test against current kernel**
+- Run both old (cooperative) and new (cluster) kernels with identical input, compare outputs bit-exact
+- Must match (HTM is deterministic given same seed)
+- Files: `tests.rs`
+**T14. Benchmark: measure PROFILE[htm_launch] + tps on H200**
+- Launch HF Job, verify steady-state tps
+- Target: ≥ 200k tps
+- If below, profile with Nsight Compute to find remaining stalls
+**T15. Document results + publish**
+---
+## Risks & Mitigations
+| Risk | Mitigation |
+|------|-----------|
+| H200 doesn't support cluster_size=16 | Fall back to cluster_size=8, use 2 clusters per region (16 SMs) |
+| Cluster barrier parity bug (deadlock) | Use CUDA-GDB's `info cuda barriers` (documented FA3 debug flow) |
+| TMA multicast descriptor setup complexity | Incremental: land cluster::sync() first (T4-T6), add TMA later (T9) |
+| Register pressure from in-kernel persistent state | Use `__launch_bounds__` + selective DRAM spill for cold state |
+| Cluster scheduling latency | Pre-build CUtensorMap once, reuse per forward call |
+---
+## Prior Art References
+- **PERKS** (closest structural analog): https://github.com/neozhang307/PERKS — persistent iterative kernel for stencils
+- **CUTLASS sm90 ping-pong**: https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
+- **CUTLASS sm90 pipeline (mbarrier API)**: https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/pipeline/sm90_pipeline.hpp
+- **FlashAttention-3 hopper/**: https://github.com/Dao-AILab/flash-attention
+- **CuTe persistent kernels**: https://github.com/simveit/cute_persistent_kernels
+- **Hopper architecture guide**: https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/
+- **PERKS paper**: arXiv:2204.02064
+---
+## Expected Outcomes
+**Best case (all phases land):**
+- htm_launch: 400 ms → 40-60 ms
+- forward total: 410 ms → 50-70 ms
+- step time: 850 ms → 250-350 ms (bounded by backward + optimizer)
+- tps: 38k → ~**160-250k** — meets 200k target
+**Minimum case (only Phase 2, cluster sync without TMA multicast):**
+- htm_launch: 400 ms → 250-320 ms
+- tps: 38k → ~60-90k — partial win, still under 200k
+**Pessimistic (cluster launch has unexpected cap):**
+- Falls back to PERKS-style in-kernel turnstile (Design 1)
+- htm_launch: 400 ms → 280-360 ms
+- tps: 38k → ~55-75k

overlay/htm_rust/bench_gpu.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""Microbenchmark: CPU vs GPU HTMLayer forward at HYDRA training sizes.
+Usage:
+    source .venv/bin/activate
+    export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
+    python htm_rust/bench_gpu.py
+"""
+import os
+import sys
+import time
+# Ensure /home/mikeb/work/feather is on sys.path so `subsystems` imports.
+_FEATHER = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if _FEATHER not in sys.path:
+    sys.path.insert(0, _FEATHER)
+import numpy as np
+import torch
+from subsystems.htm import HTMLayer
+def bench(layer: HTMLayer, sdr: torch.Tensor, warmup: int = 1, iters: int = 3) -> float:
+    """Return mean ms/forward."""
+    for _ in range(warmup):
+        _ = layer(sdr)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    for _ in range(iters):
+        _ = layer(sdr)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    dt = time.perf_counter() - t0
+    return dt * 1000 / iters
+def main() -> None:
+    # HYDRA training config: B=8, T=2048, bits=16384, cols=2048.
+    B, T, D = int(os.environ.get("B", 8)), int(os.environ.get("T", 2048)), 16384
+    n_cols = 2048
+    print(f"config: B={B} T={T} D={D} n_cols={n_cols}")
+    print(f"torch: {torch.__version__} cuda={torch.cuda.is_available()}")
+    # Build a fixed sparse SDR once.
+    rng = np.random.default_rng(0)
+    sdr = np.zeros((B, T, D), dtype=bool)
+    on = int(D * 0.02)
+    for b in range(B):
+        for t in range(T):
+            idx = rng.choice(D, size=on, replace=False)
+            sdr[b, t, idx] = True
+    sdr_t = torch.from_numpy(sdr)
+    # CPU baseline.
+    print("\n--- CPU ---")
+    cpu_layer = HTMLayer(
+        input_bits=D, n_columns=n_cols, cells_per_column=32,
+        batch_size=B, seed=42, use_gpu=False,
+    )
+    cpu_layer.train()
+    cpu_ms = bench(cpu_layer, sdr_t, warmup=1, iters=2)
+    print(f"CPU: {cpu_ms:.1f} ms/forward  ({cpu_ms/T:.2f} ms/step × T={T})")
+    # GPU.
+    print("\n--- GPU ---")
+    gpu_layer = HTMLayer(
+        input_bits=D, n_columns=n_cols, cells_per_column=32,
+        batch_size=B, seed=42, use_gpu=True,
+    )
+    gpu_layer.train()
+    sdr_cuda = sdr_t.cuda()
+    gpu_ms = bench(gpu_layer, sdr_cuda, warmup=1, iters=2)
+    print(f"GPU: {gpu_ms:.1f} ms/forward  ({gpu_ms/T:.2f} ms/step × T={T})")
+    print(f"\nSpeedup: {cpu_ms / gpu_ms:.2f}x")
+if __name__ == "__main__":
+    main()

overlay/htm_rust/docs/GPU_HTM.md ADDED Viewed

	@@ -0,0 +1,302 @@

+# GPU HTM Backend
+## Status
+**FUSED MEGAKERNEL: entire T-timestep SP+TM forward collapsed into a single
+CUDA launch per forward pass.**
+* Legacy path: 12 kernels × T=2048 timesteps = 24K launches per forward.
+* Fused path: **1 launch per forward** (24000× launch-overhead reduction).
+* End-to-end training throughput: **~2.7k → ~60k tok/sec** (~22x speedup).
+* Fused path uses per-column threshold inhibition instead of global top-K
+  (see §Fused Kernel below — this is a real architectural change).
+## Fused Kernel
+### Why
+Global top-K column selection requires cross-block synchronization at every
+timestep. On WSL2/sm_86 without `-rdc=true`, `cooperative_groups::grid_sync()`
+is unreliable. Without a grid sync, collapsing the T-loop into one kernel is
+impossible, so every forward pays 12×T kernel launches and 90%+ of runtime is
+CUDA launch overhead + small-kernel tails.
+### How
+Replace global top-K with **per-column threshold activation**:
+    is_active[c] = (overlap[c] * boost[c]) > inhibition_threshold[c]
+`inhibition_threshold[c]` is a per-column scalar, learned via EMA update:
+    err = active_duty[c] - sparsity_target
+    new_thr = clamp(thr + thr_adapt_rate * err * 100, 0.1, 1000)
+This is biologically grounded (GABAergic local lateral inhibition in
+neocortical columns) and supported by HTM theory. The duty-cycle-driven
+feedback loop was already present; we simply redirect its output to drive
+activation threshold instead of multiplicative boost. The global top-K,
+which had no biological basis, is removed.
+### Cross-block coherence
+- **Ping-pong bitsets** for `cell_active_bits` and `cell_winner_bits`: at
+  even t write to `_a`, read from `_b`; at odd t reversed. This eliminates
+  the need for an in-place snapshot kernel between timesteps.
+- **Primary path: cooperative launch + hardware grid sync**. Host code probes
+  `CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH`, computes the cooperative whole-grid
+  residency limit from occupancy, and launches the fused megakernel with
+  `cuLaunchCooperativeKernel`. In-kernel barriers use
+  `cooperative_groups::this_grid().sync()`.
+- **Fallback path: software grid barrier** via a 3-slot atomic counter array
+  (`barrier_counters`). This remains as a compatibility fallback when
+  cooperative launch is unavailable.
+- **Launch invariant**: cooperative launch is capped to the hardware residency
+  limit for `blockDim.x = 1024`; software fallback remains capped conservatively
+  (`HTM_FUSED_GRID_CAP`, default 8) to avoid whole-grid spin deadlock.
+### Kernel structure
+```
+for t in 0..T:
+    # Phase 0: clear curr_active/curr_winner for my column range
+    grid_barrier()
+    # Phase A: SP overlap → boost → threshold → SP learn → duty + threshold EMA
+    grid_barrier()
+    # Phase B: TM predict (per cell, per seg) → TM learn (reinforce on match)
+    #                   → burst if none predicted → segment grow/reinforce
+    grid_barrier()
+    # Phase C: block 0 writes anomaly[t]
+```
+Each warp owns a contiguous slice of columns. At grid=24 blocks × 32 warps =
+768 warps, n_columns=2048 → 2-3 columns per warp.
+### Parity with legacy GPU path
+**Semantics diverge**. Legacy: exactly `k = round(sparsity * n_cols)` columns
+active per step. Fused: variable, converging to `sparsity * n_cols` on
+average via the per-column EMA. Anomaly decay on repeating sequences is
+preserved (see `gpu_fused_tm_anomaly_decays_on_repeating_sequence` test).
+This is an intentional architectural change committed under
+`no-bypass/full-architecture` per program.md rules. The legacy top-K path
+(`step_many_cuda`) remains available for reference and can be re-enabled via
+`HYDRA_HTM_FUSED=0`.
+### Tests
+- `gpu_threshold_converges_to_sparsity` (tests.rs): 1000-step warmup on
+  random SDRs, then measure mean active cols/step on next 200 steps. Must
+  land within [0.25×, 4×] of `sparsity_target * n_cols`.
+- `gpu_fused_tm_anomaly_decays_on_repeating_sequence`: feed A,B,C repeating
+  for 300 steps. Late anomaly must be < early anomaly AND < 0.5.
+## Legacy Pipeline (kept for fallback)
+* SP: 5 kernels, bit-identical parity with CPU under strict-parity mode.
+* TM: 7 kernels, relaxed-parity with CPU.
+* Speedup at training size (B=8, T=2048, bits=16384): **3.83x** vs CPU.
+## Building
+CPU-only (default, zero CUDA dep):
+```bash
+cargo build --release
+```
+GPU-enabled:
+```bash
+export PATH=/usr/local/cuda-12.1/bin:$PATH
+export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
+export HTM_PTX_VERSION=7.8   # lower if driver older than nvcc
+cargo build --release --features gpu
+cargo test  --release --features gpu --lib   # fused path includes cooperative launch + grid-sync tests
+# Python wheel:
+maturin develop --release --features gpu --manifest-path htm_rust/Cargo.toml
+```
+## Architecture
+### Module layout
+```
+src/gpu/
+  mod.rs            # HTMRegionGpu pyclass + step_many_gpu (full pipeline)
+  sp_gpu.rs         # Persistent SP device buffers + step_batch_with_tm
+  tm_gpu.rs         # Persistent TM device buffers + step (predict→activate→learn)
+  tests.rs          # CPU-vs-GPU SP parity + end-to-end TM anomaly decay
+  kernels/
+    sp_overlap.cu       # per-column overlap reduction
+    sp_topk.cu          # k-WTA top-K winner selection
+    sp_learn.cu         # Hebbian +inc/-dec on proximal synapses
+    sp_duty.cu          # EMA duty-cycle update
+    sp_boost_fused.cu   # fused mean + exp boost (GPU-side)
+    tm_reset.cu         # per-step: snapshot active→prev, clear buffers
+    tm_predict.cu       # per-cell: score owned segments vs prev_active_bits
+    tm_activate.cu      # per-col: activate predicted cells OR burst
+    tm_learn.cu         # per-cell: reinforce correctly-predicted segments
+    tm_punish.cu        # per-cell: decay matching segs on inactive cols
+    tm_grow.cu          # per-bursting-col: reuse matching seg OR create new,
+                        #                    grow synapses to prev_winners
+    tm_anomaly.cu       # per-step: unpredicted/active ratio
+```
+### Persistent SP state (per region, unchanged from Phase 1)
+At n_cols=2048, S=40, bits=16384: ~355 KB persistent + ~90 KB transient.
+### Persistent TM state (per region)
+Capacity knobs (configured in `tm_gpu.rs`):
+- `MAX_SEGMENTS_PER_CELL = 4`
+- `MAX_SYN_PER_SEGMENT   = 20`
+At cells_per_col=32, n_cols=2048:
+- `n_cells          = 65_536`
+- `n_segments_max   = 262_144`   (~262K)
+- `n_synapses_max   = 5_242_880` (~5.2M)
+| Buffer                | Shape / type         | Notes                                  |
+|-----------------------|----------------------|----------------------------------------|
+| `seg_cell_id`         | (n_segs,) u32        | owning cell; U32_MAX = unused          |
+| `seg_syn_count`       | (n_segs,) u32        | #active synapses in slot               |
+| `syn_presyn`          | (n_segs × S,) u32    | presynaptic cell indices               |
+| `syn_perm`            | (n_segs × S,) i16    | permanence scaled 0..32767 (0.0..1.0)  |
+| `cell_seg_count`      | (n_cells,) u32       | segments allocated on each cell        |
+| `cell_active_bits`    | (n_cells/32,) u32    | packed bitset, current step            |
+| `cell_winner_bits`    | (n_cells/32,) u32    | packed bitset, current step            |
+| `cell_predictive_bits`| (n_cells/32,) u32    | set by predict, read by activate       |
+| `prev_active_bits`    | (n_cells/32,) u32    | snapshot at step start                 |
+| `prev_winner_bits`    | (n_cells/32,) u32    | snapshot at step start                 |
+| `col_predicted`       | (n_cols,) u8         | set if any cell in col is predictive   |
+| `col_best_match`      | (n_cols,) u32        | packed (pot<<21 | seg_id), atomicMax  |
+| `seg_num_active_conn` | (n_segs,) u32        | output of predict                      |
+| `seg_num_active_pot`  | (n_segs,) u32        | output of predict                      |
+| `unpredicted_count`   | (1,) u32             | atomic counter for anomaly             |
+| `burst_cols_flat`     | (n_cols,) u32        | list of bursting cols                  |
+| `burst_cols_count`    | (1,) u32             | length of above list                   |
+**Total per TM region: ~42 MB.** Batch of 8 regions: ~340 MB. Fits 6 GB RTX 3060.
+### Per-step pipeline (single iteration of `step_batch_with_tm`)
+```
+  SP side                            TM side
+  ---------                          ---------
+  1. D2D input slice → inp_dev
+  2. sp_overlap (n_cols blocks)
+  3. sp_topk    (1 block)
+  4. sp_learn   (n_cols blocks)
+  5. sp_duty    (n_cols/256 blocks)
+  6. sp_boost_fused (1 block)
+  7. D2D active_mask → cols_dev[ti]
+                                     8. tm_reset_step   (ceil(n_cells/32/256))
+                                     9. tm_predict      (n_cells blocks × 32 thr)
+                                    10. tm_activate     (n_cols/256 blocks)
+                                    11. tm_anomaly      (1 block)
+                                    if learn:
+                                    12. tm_learn        (n_cells blocks)
+                                    13. tm_punish       (n_cells blocks)
+                                    14. tm_grow         (n_cols blocks — early-exits)
+```
+No host sync in the T-step loop. At the end one `dtoh_sync_copy` each for
+`cols_dev` (T × n_cols bytes) and `anom_dev` (T × f32).
+## Parity
+### SP: strict bit-identical
+See Phase 1 docs — `gpu_sp_matches_cpu_with_learn` over 50 steps passes exact.
+### TM: relaxed-parity
+The GPU TM has known, deliberate deviations from CPU to admit massive parallelism:
+1. **Bursting winner cell**: CPU picks the least-used cell (fewest segments) with
+   random tiebreak. GPU picks cell 0 of the column (deterministic, branch-free).
+   Learning dynamics are preserved because segment creation/reinforcement is
+   the dominant effect, not which specific cell in a bursting column wins.
+2. **Permanence storage**: i16 fixed-point (scale 32767) vs f32. Rounding
+   differs by <=1 ULP of the scale (~3.0e-5), below any meaningful learning
+   quantum (inc=0.10, dec=0.10, predicted_segment_dec=0.10).
+3. **Grown synapse candidate order**: CPU randomly samples from prev_winner_cells.
+   GPU iterates prev_winner_bits words in a pseudo-random rotated order keyed
+   by (bursting_col_idx, iter_seed). Output is a different subset but same size.
+4. **Segment LRU eviction**: CPU tracks `last_used_iteration` per segment.
+   GPU wraps around (slot = count % max_segments_per_cell). In the autoresearch
+   loop where TM resets every forward, eviction rarely triggers.
+The GPU parity test (`gpu_tm_anomaly_decays_on_repeating_sequence`) feeds a
+repeating A,B,C sequence and asserts anomaly decays: **1.000 early → 0.000 late**.
+## Bottleneck Analysis
+| Source                           | Cost/step (B=8 T=2048)   |
+|----------------------------------|-------------------------:|
+| 14 kernel launches               | ~70 μs                   |
+| ~262K predict/learn/punish blocks| ~2.5 ms                  |
+| No D2H until end-of-batch        | 0 μs                     |
+| Final D2H (T × n_cols + T × f32) | ~200 μs per region       |
+Per-step wall time at B=8 T=2048:
+- CPU (reference): **~11.4 ms / step**
+- GPU (current):   **~2.98 ms / step**
+- **Speedup: 3.83x**
+## End-to-End Training Benchmark
+**Config**: B=8, T=2048, vocab=8192, 60-second time budget, full HYDRA stack
+(SDR Semantic + HTM + Mamba-3 + Engram + mHC + Hestia QAT).
+**Results**:
+- GPU util: **97-98% sustained**
+- VRAM: **5.4 GB / 6.0 GB** (90% utilisation)
+- Steps completed: 16
+- tok/sec: **~2,200-2,500** (stable post-warmup)
+- Final val_bpb: **2.249** (from ~3.1 initial)
+- Factual eval: 1/9 hits
+Compared to previous CPU-HTM baseline (~100 tok/s), the full-GPU HTM delivers
+**~22x end-to-end throughput** — far above the 3-10x target.
+## Bench Commands
+```bash
+source .venv/bin/activate
+export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
+# Microbench
+B=8 T=2048 python htm_rust/bench_gpu.py
+# Full training
+HYDRA_TIME_BUDGET=60 HYDRA_BATCH_SIZE=8 HYDRA_TOTAL_BATCH=32768 python -u train.py
+```
+## Known Limitations / Future Work
+- **Segment-compacted launches**: predict/learn/punish iterate all n_cells
+  blocks, using `cell_seg_count` to skip empty cells. A compacted live-cell
+  list would shave another ~40% of launch overhead.
+- **Winner selection**: currently cell 0 of bursting col. Proper least-used
+  selection would help stability of cross-column patterns.
+- **Single CUDA stream per region**: with B=8 regions we serialise on stream 0.
+  Multi-stream would lift the ~20% launch overhead at small batch sizes.
+- **Permanence bump on chronically under-stimulated columns**: SP's strict-parity
+  bump is not mirrored on GPU fast path. Effect on long runs needs measurement.
+- **`seg_num_active_conn` output is reused across reinforce + punish**: the two
+  kernels each launch n_cells blocks. They could be fused into one for one fewer
+  kernel launch per step.
+## Files
+- `htm_rust/build.rs` — nvcc-driven PTX compilation, 12 kernels.
+- `htm_rust/Cargo.toml` — `gpu` feature flag, cudarc dep.
+- `htm_rust/src/gpu/mod.rs` — `HTMRegionGpu` pyclass + `step_many_gpu`.
+- `htm_rust/src/gpu/sp_gpu.rs` — SP state + `step_batch_with_tm`.
+- `htm_rust/src/gpu/tm_gpu.rs` — TM state + `step`.
+- `htm_rust/src/gpu/tests.rs` — parity + correctness tests.
+- `htm_rust/src/gpu/kernels/*.cu` — 5 SP + 7 TM kernels.
+- `htm_rust/bench_gpu.py` — CPU-vs-GPU microbench.
+- `subsystems/htm.py` — transparent GPU/CPU backend selection in `HTMLayer`.

overlay/htm_rust/src/gpu/fused.rs CHANGED Viewed

@@ -20,8 +20,7 @@
 use std::ffi::CString;
 use std::sync::Arc;
-use cudarc::driver::{result, sys, CudaDevice, CudaSlice, DeviceRepr, DevicePtr, DriverError,
-                      LaunchConfig};
 use cudarc::nvrtc::Ptx;
 use super::sp_gpu::SpatialPoolerGpu;
@@ -150,7 +149,11 @@ pub(crate) fn plan_fused_launch(
     let default_grid_cap = 16u32;
     let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
     let resident_bound = if cooperative_grid_limit > 0 {
-        cooperative_grid_limit.max(sm_count * 2)
     } else {
         sm_count * 2
     };
@@ -280,7 +283,9 @@ impl FusedState {
             }
             _ => 0u32,
         };
-        eprintln!("[htm_rust] cluster: max_cluster_size={}", max_cluster_size);
         let cluster_info = ClusterInfo { max_cluster_size };
         let cooperative_supported = matches!(
@@ -289,7 +294,10 @@ impl FusedState {
         );
         let cooperative_grid_limit = if cooperative_supported {
             let blocks_per_sm = unsafe {
-                result::occupancy::max_active_block_per_multiprocessor(function, 1024, 0)
             }
             .ok()
             .map(|v| v.max(0) as u32)
@@ -310,11 +318,13 @@ impl FusedState {
             DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_NOT_SUPPORTED)
         })?;
-        eprintln!(
-            "[htm_rust] fused kernel: sm_count={} grid_dim_x={} cooperative_grid_limit={} cluster_max={}",
-            launch_plan.sm_count, launch_plan.grid_dim_x, launch_plan.cooperative_grid_limit,
-            cluster_info.max_cluster_size,
-        );
         Ok(Self {
             dev,
@@ -513,6 +523,38 @@ pub(super) fn launch_fused_batched_raw(
     assert_eq!(anom_per_region.len(), b);
     assert!(b >= 1, "need at least one region");
     // Reset per-region step_scratch before each launch.
     for &rp in region_ptrs.iter() {
         let r = unsafe { &mut *rp };
@@ -659,5 +701,11 @@ pub(super) fn launch_fused_batched_raw(
         }
     }
     Ok(())
 }

 use std::ffi::CString;
 use std::sync::Arc;
+use cudarc::driver::{result, sys, CudaDevice, CudaSlice, DevicePtr, DeviceRepr, DriverError};
 use cudarc::nvrtc::Ptx;
 use super::sp_gpu::SpatialPoolerGpu;
     let default_grid_cap = 16u32;
     let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
     let resident_bound = if cooperative_grid_limit > 0 {
+        // A10G/sm86 uses cooperative grid sync in the fused kernel. The grid
+        // may not exceed resident cooperative capacity, or the kernel can fail
+        // (or worse, deadlock at grid.sync()). Do not inflate this above the
+        // driver-reported occupancy limit.
+        cooperative_grid_limit
     } else {
         sm_count * 2
     };
             }
             _ => 0u32,
         };
+        if std::env::var_os("HTM_RUST_VERBOSE_LAUNCH").is_some() {
+            eprintln!("[htm_rust] cluster: max_cluster_size={}", max_cluster_size);
+        }
         let cluster_info = ClusterInfo { max_cluster_size };
         let cooperative_supported = matches!(
         );
         let cooperative_grid_limit = if cooperative_supported {
             let blocks_per_sm = unsafe {
+                // Keep this in sync with plan_fused_launch's block_dim_x. The
+                // fused kernels are launch_bounds(256, ...); querying with
+                // 1024 underestimates sm86 residency and breaks A10G tuning.
+                result::occupancy::max_active_block_per_multiprocessor(function, 256, 0)
             }
             .ok()
             .map(|v| v.max(0) as u32)
             DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_NOT_SUPPORTED)
         })?;
+        if std::env::var_os("HTM_RUST_VERBOSE_LAUNCH").is_some() {
+            eprintln!(
+                "[htm_rust] fused kernel: sm_count={} grid_dim_x={} cooperative_grid_limit={} cluster_max={}",
+                launch_plan.sm_count, launch_plan.grid_dim_x, launch_plan.cooperative_grid_limit,
+                cluster_info.max_cluster_size,
+            );
+        }
         Ok(Self {
             dev,
     assert_eq!(anom_per_region.len(), b);
     assert!(b >= 1, "need at least one region");
+    // A10G/sm86 pre-Hopper path uses cooperative launch with grid.sync(). The
+    // total resident grid is grid_x * B, so B must be chunked to fit the
+    // driver-reported cooperative residency. Without this, large training
+    // batches either fail cooperatively or fall back to B sequential launches.
+    {
+        let r0 = unsafe { &*region_ptrs[0] };
+        let use_cluster = r0.fused_state.cluster_info.max_cluster_size > 0;
+        if !use_cluster {
+            let grid_x = r0.fused_state.grid_dim_x.max(1);
+            let coop_limit = r0.fused_state.cooperative_grid_limit;
+            if coop_limit == 0 {
+                return Err(DriverError(sys::CUresult::CUDA_ERROR_NOT_SUPPORTED));
+            }
+            let max_regions_per_launch = (coop_limit / grid_x).max(1) as usize;
+            if b > max_regions_per_launch {
+                for start in (0..b).step_by(max_regions_per_launch) {
+                    let end = (start + max_regions_per_launch).min(b);
+                    launch_fused_batched_raw(
+                        &region_ptrs[start..end],
+                        &inputs_per_region[start..end],
+                        &cols_per_region[start..end],
+                        &anom_per_region[start..end],
+                        t,
+                        input_bits,
+                        learn,
+                    )?;
+                }
+                return Ok(());
+            }
+        }
+    }
     // Reset per-region step_scratch before each launch.
     for &rp in region_ptrs.iter() {
         let r = unsafe { &mut *rp };
         }
     }
+    // ptrs_dev is temporary device memory consumed by the launched batched
+    // kernel. Synchronize before it is dropped; single-region step_many_fused_cuda
+    // also synchronizes today, so this preserves correctness while still
+    // reducing B separate launches to chunked cooperative launches.
+    dev.synchronize()?;
     Ok(())
 }

overlay/htm_rust/src/gpu/mod.rs CHANGED Viewed

@@ -25,7 +25,7 @@ mod tests;
 use std::mem::ManuallyDrop;
 use pyo3::prelude::*;
-use pyo3::types::{PyDict, PyTuple};
 use numpy::{PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray2, PyUntypedArrayMethods};
 use crate::region::HTMRegionCore;
@@ -423,7 +423,140 @@ impl HTMRegionGpu {
     }
 }
 pub fn register(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<HTMRegionGpu>()?;
     Ok(())
 }

 use std::mem::ManuallyDrop;
 use pyo3::prelude::*;
+use pyo3::types::{PyDict, PyList, PyTuple};
 use numpy::{PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray2, PyUntypedArrayMethods};
 use crate::region::HTMRegionCore;
     }
 }
+#[pyfunction]
+fn step_batch_fused_cuda(
+    regions: &Bound<'_, PyAny>,
+    sdr_cais: &Bound<'_, PyAny>,
+    cols_cais: &Bound<'_, PyAny>,
+    anom_cais: &Bound<'_, PyAny>,
+    learn: bool,
+) -> PyResult<()> {
+    let regions_list: Bound<'_, PyList> = regions
+        .clone()
+        .downcast_into()
+        .map_err(|_| pyo3::exceptions::PyTypeError::new_err("regions must be a list"))?;
+    let sdr_list: Bound<'_, PyList> = sdr_cais
+        .clone()
+        .downcast_into()
+        .map_err(|_| pyo3::exceptions::PyTypeError::new_err("sdr_cais must be a list"))?;
+    let cols_list: Bound<'_, PyList> = cols_cais
+        .clone()
+        .downcast_into()
+        .map_err(|_| pyo3::exceptions::PyTypeError::new_err("cols_cais must be a list"))?;
+    let anom_list: Bound<'_, PyList> = anom_cais
+        .clone()
+        .downcast_into()
+        .map_err(|_| pyo3::exceptions::PyTypeError::new_err("anom_cais must be a list"))?;
+    let b = regions_list.len();
+    if b == 0 {
+        return Err(pyo3::exceptions::PyValueError::new_err("need at least one region"));
+    }
+    if sdr_list.len() != b || cols_list.len() != b || anom_list.len() != b {
+        return Err(pyo3::exceptions::PyValueError::new_err(format!(
+            "list length mismatch: regions={} sdr={} cols={} anom={}",
+            b,
+            sdr_list.len(),
+            cols_list.len(),
+            anom_list.len()
+        )));
+    }
+    let mut region_refs: Vec<PyRefMut<'_, HTMRegionGpu>> = Vec::with_capacity(b);
+    let mut region_ptrs: Vec<*mut HTMRegionGpu> = Vec::with_capacity(b);
+    let mut inputs_per_region: Vec<u64> = Vec::with_capacity(b);
+    let mut cols_per_region: Vec<u64> = Vec::with_capacity(b);
+    let mut anom_per_region: Vec<u64> = Vec::with_capacity(b);
+    let mut shared_t: Option<usize> = None;
+    let mut shared_input_bits: Option<usize> = None;
+    let mut shared_n_columns: Option<usize> = None;
+    for i in 0..b {
+        let mut region_ref: PyRefMut<'_, HTMRegionGpu> = regions_list.get_item(i)?.extract()?;
+        let region_t_bits = region_ref.input_bits;
+        let region_cols = region_ref.n_columns;
+        let region_ptr: *mut HTMRegionGpu = &mut *region_ref;
+        let sdr_dict: Bound<'_, PyDict> = sdr_list
+            .get_item(i)?
+            .downcast_into()
+            .map_err(|_| pyo3::exceptions::PyTypeError::new_err("sdr CAI entries must be dicts"))?;
+        let cols_dict: Bound<'_, PyDict> = cols_list
+            .get_item(i)?
+            .downcast_into()
+            .map_err(|_| pyo3::exceptions::PyTypeError::new_err("cols CAI entries must be dicts"))?;
+        let anom_dict: Bound<'_, PyDict> = anom_list
+            .get_item(i)?
+            .downcast_into()
+            .map_err(|_| pyo3::exceptions::PyTypeError::new_err("anom CAI entries must be dicts"))?;
+        let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(&sdr_dict)?;
+        let (cols_ptr, cols_shape, cols_type) = cai_parse(&cols_dict)?;
+        let (anom_ptr, anom_shape, anom_type) = cai_parse(&anom_dict)?;
+        if sdr_type != "|u1" {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "sdr_cai[{i}] typestr must be '|u1' (uint8), got {sdr_type}",
+            )));
+        }
+        if cols_type != "|u1" {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "cols_cai[{i}] typestr must be '|u1' (uint8), got {cols_type}",
+            )));
+        }
+        if anom_type != "<f4" && anom_type != "=f4" {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "anom_cai[{i}] typestr must be '<f4' (float32), got {anom_type}",
+            )));
+        }
+        if sdr_shape.len() != 2 || sdr_shape[1] != region_t_bits {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "sdr_cai[{i}] shape {sdr_shape:?} != (T, {region_t_bits})",
+            )));
+        }
+        let this_t = sdr_shape[0];
+        if cols_shape != [this_t, region_cols] {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "cols_cai[{i}] shape {cols_shape:?} != ({this_t}, {region_cols})",
+            )));
+        }
+        if anom_shape != [this_t] {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "anom_cai[{i}] shape {anom_shape:?} != ({this_t},)",
+            )));
+        }
+        if shared_t.replace(this_t).is_some_and(|prev| prev != this_t)
+            || shared_input_bits.replace(region_t_bits).is_some_and(|prev| prev != region_t_bits)
+            || shared_n_columns.replace(region_cols).is_some_and(|prev| prev != region_cols)
+        {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "all batched HTM regions must share T/input_bits/n_columns",
+            ));
+        }
+        region_refs.push(region_ref);
+        region_ptrs.push(region_ptr);
+        inputs_per_region.push(sdr_ptr);
+        cols_per_region.push(cols_ptr);
+        anom_per_region.push(anom_ptr);
+    }
+    fused::launch_fused_batched_raw(
+        &region_ptrs,
+        &inputs_per_region,
+        &cols_per_region,
+        &anom_per_region,
+        shared_t.unwrap(),
+        shared_input_bits.unwrap(),
+        learn,
+    )
+    .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("step_batch_fused_cuda: {e:?}")))?;
+    drop(region_refs);
+    Ok(())
+}
 pub fn register(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<HTMRegionGpu>()?;
+    m.add_function(wrap_pyfunction!(step_batch_fused_cuda, m)?)?;
     Ok(())
 }

overlay/htm_rust/src/lib.rs CHANGED Viewed

@@ -34,6 +34,7 @@ use numpy::{
     PyUntypedArrayMethods,
 };
 use pyo3::prelude::*;
 use crate::region::HTMRegionCore;
@@ -135,6 +136,32 @@ impl HTMRegion {
     /// Clear TM predictive state. Does NOT unlearn synapses.
     fn reset(&mut self) { self.core.reset(); }
     /// Process T timesteps from a `(T, input_bits)` bool ndarray.
     ///
     /// Returns:

     PyUntypedArrayMethods,
 };
 use pyo3::prelude::*;
+use pyo3::types::PyBytes;
 use crate::region::HTMRegionCore;
     /// Clear TM predictive state. Does NOT unlearn synapses.
     fn reset(&mut self) { self.core.reset(); }
+    /// Serialize the full SP+TM state to bytes.
+    fn save_state<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyBytes>> {
+        let bytes = bincode::serialize(&self.core).map_err(|e| {
+            pyo3::exceptions::PyRuntimeError::new_err(format!("serialize HTM state: {e}"))
+        })?;
+        Ok(PyBytes::new_bound(py, &bytes))
+    }
+    /// Restore a state blob created by save_state().
+    fn load_state(&mut self, blob: &[u8]) -> PyResult<()> {
+        let core: HTMRegionCore = bincode::deserialize(blob).map_err(|e| {
+            pyo3::exceptions::PyValueError::new_err(format!("deserialize HTM state: {e}"))
+        })?;
+        if core.sp.cfg.input_bits != self.core.sp.cfg.input_bits
+            || core.sp.cfg.n_columns != self.core.sp.cfg.n_columns
+            || core.tm.cfg.n_columns != self.core.tm.cfg.n_columns
+            || core.tm.cfg.cells_per_column != self.core.tm.cfg.cells_per_column
+        {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "HTM state shape does not match this region",
+            ));
+        }
+        self.core = core;
+        Ok(())
+    }
     /// Process T timesteps from a `(T, input_bits)` bool ndarray.
     ///
     /// Returns:

overlay/htm_rust/src/region.rs CHANGED Viewed

@@ -2,7 +2,9 @@
 use crate::sp::{SpatialPooler, SpatialPoolerConfig};
 use crate::tm::{TemporalMemory, TemporalMemoryConfig};
 pub struct HTMRegionCore {
     pub sp: SpatialPooler,
     pub tm: TemporalMemory,

 use crate::sp::{SpatialPooler, SpatialPoolerConfig};
 use crate::tm::{TemporalMemory, TemporalMemoryConfig};
+use serde::{Deserialize, Serialize};
+#[derive(Serialize, Deserialize)]
 pub struct HTMRegionCore {
     pub sp: SpatialPooler,
     pub tm: TemporalMemory,

overlay/htm_rust/src/sp.rs CHANGED Viewed

@@ -15,10 +15,11 @@ use rand::Rng;
 use rand::SeedableRng;
 use rand::seq::SliceRandom;
 use rand_xoshiro::Xoshiro256PlusPlus;
 /// A single proximal dendrite: a sparse set of potential synapses onto
 /// specific input bit indices, with per-synapse permanence values.
-#[derive(Clone)]
 pub struct ProximalDendrite {
     /// Indices into the input SDR.  Length == potential_synapses.
     pub inputs: Vec<u32>,
@@ -26,6 +27,7 @@ pub struct ProximalDendrite {
     pub perms: Vec<f32>,
 }
 pub struct SpatialPoolerConfig {
     pub input_bits: usize,
     pub n_columns: usize,
@@ -64,6 +66,7 @@ impl Default for SpatialPoolerConfig {
     }
 }
 pub struct SpatialPooler {
     pub cfg: SpatialPoolerConfig,
     pub columns: Vec<ProximalDendrite>,
@@ -265,6 +268,7 @@ mod tests {
     use rand::Rng;
     use rand::SeedableRng;
     use rand_xoshiro::Xoshiro256PlusPlus;
     #[test]
     fn sp_sparsity_exact_2pct() {

 use rand::SeedableRng;
 use rand::seq::SliceRandom;
 use rand_xoshiro::Xoshiro256PlusPlus;
+use serde::{Deserialize, Serialize};
 /// A single proximal dendrite: a sparse set of potential synapses onto
 /// specific input bit indices, with per-synapse permanence values.
+#[derive(Clone, Serialize, Deserialize)]
 pub struct ProximalDendrite {
     /// Indices into the input SDR.  Length == potential_synapses.
     pub inputs: Vec<u32>,
     pub perms: Vec<f32>,
 }
+#[derive(Clone, Serialize, Deserialize)]
 pub struct SpatialPoolerConfig {
     pub input_bits: usize,
     pub n_columns: usize,
     }
 }
+#[derive(Serialize, Deserialize)]
 pub struct SpatialPooler {
     pub cfg: SpatialPoolerConfig,
     pub columns: Vec<ProximalDendrite>,
     use rand::Rng;
     use rand::SeedableRng;
     use rand_xoshiro::Xoshiro256PlusPlus;
+use serde::{Deserialize, Serialize};
     #[test]
     fn sp_sparsity_exact_2pct() {

overlay/htm_rust/src/tm.rs CHANGED Viewed

@@ -45,17 +45,18 @@
 use rand::Rng;
 use rand::SeedableRng;
 use rand_xoshiro::Xoshiro256PlusPlus;
 type CellIdx = u32;
 type SegmentIdx = u32;
-#[derive(Clone)]
 pub struct Synapse {
     pub presynaptic_cell: CellIdx,
     pub permanence: f32,
 }
-#[derive(Clone)]
 pub struct Segment {
     pub cell: CellIdx,
     pub synapses: Vec<Synapse>,
@@ -66,6 +67,7 @@ pub struct Segment {
     pub last_used_iteration: u64,
 }
 pub struct TemporalMemoryConfig {
     pub n_columns: usize,
     pub cells_per_column: usize,
@@ -100,6 +102,7 @@ impl Default for TemporalMemoryConfig {
     }
 }
 pub struct TemporalMemory {
     pub cfg: TemporalMemoryConfig,
     /// All segments in the region. Indexed by SegmentIdx.
@@ -485,6 +488,7 @@ mod tests {
     use rand::Rng;
     use rand::SeedableRng;
     use rand_xoshiro::Xoshiro256PlusPlus;
     #[test]
     fn tm_learns_repeating_sequence() {

 use rand::Rng;
 use rand::SeedableRng;
 use rand_xoshiro::Xoshiro256PlusPlus;
+use serde::{Deserialize, Serialize};
 type CellIdx = u32;
 type SegmentIdx = u32;
+#[derive(Clone, Serialize, Deserialize)]
 pub struct Synapse {
     pub presynaptic_cell: CellIdx,
     pub permanence: f32,
 }
+#[derive(Clone, Serialize, Deserialize)]
 pub struct Segment {
     pub cell: CellIdx,
     pub synapses: Vec<Synapse>,
     pub last_used_iteration: u64,
 }
+#[derive(Clone, Serialize, Deserialize)]
 pub struct TemporalMemoryConfig {
     pub n_columns: usize,
     pub cells_per_column: usize,
     }
 }
+#[derive(Serialize, Deserialize)]
 pub struct TemporalMemory {
     pub cfg: TemporalMemoryConfig,
     /// All segments in the region. Indexed by SegmentIdx.
     use rand::Rng;
     use rand::SeedableRng;
     use rand_xoshiro::Xoshiro256PlusPlus;
+use serde::{Deserialize, Serialize};
     #[test]
     fn tm_learns_repeating_sequence() {

overlay/htm_rust/uv.lock ADDED Viewed

	@@ -0,0 +1,8 @@

+version = 1
+revision = 3
+requires-python = ">=3.11"
+[[package]]
+name = "htm-rust"
+version = "0.1.0"
+source = { editable = "." }

overlay/hydra/model.py CHANGED Viewed

@@ -49,18 +49,51 @@ from subsystems.sdr_semantic import SemanticFoldingSDR
 from hydra.engram import GPUEngram
 from hydra.htm_cache import htm_cache_key, htm_cache_matches
 from hydra.hyena_block import HyenaBlock
 # GDNBlock is imported lazily inside __init__ so the `fla` dependency is
 # only required when HYDRA_GDN_LAYERS is actually non-empty. Baseline
 # pure-Mamba3 runs continue to work without flash-linear-attention installed.
 from hydra.optimizer import MuonAdamW
 from hydra.sampled_softmax import UnigramSampler, sampled_softmax_loss
 def norm(x: torch.Tensor) -> torch.Tensor:
     """RMSNorm over the last dim — stateless, autocast-friendly."""
     return F.rms_norm(x, (x.size(-1),))
 class PostSemClawModel(nn.Module):
     """Full Post-SEM-Claw model assembly.
@@ -131,10 +164,7 @@ class PostSemClawModel(nn.Module):
                     n_heads=config.n_heads,
                 )
             if Mamba3 is None:
-                raise RuntimeError(
-                    "mamba_ssm is required for Mamba3 layers; set hyena_layers/gdn_layers "
-                    "to cover every layer or run inside the HF runtime image."
-                )
             block = Mamba3(
                 d_model=config.d_model,
                 d_state=config.d_state,
@@ -179,6 +209,22 @@ class PostSemClawModel(nn.Module):
             n_columns=config.engram_n_columns,
             max_ngram=3,
         )
         self.engram_layer_idx = config.engram_layer_idx
         # Manifold-Constrained Hyper-Connections (one per Mamba-3 block).
@@ -398,12 +444,28 @@ class PostSemClawModel(nn.Module):
         nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
         # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
         # dtypes in the same shape group would break lerp_ dtype checks.
         self.wte.to(dtype=torch.bfloat16)
         self.blocks.to(dtype=torch.bfloat16)
         self.htm_proj.to(dtype=torch.bfloat16)
         self.engram.to(dtype=torch.bfloat16)
     def set_bos_token_id(self, bos_id: int) -> None:
         """Inform the model of the tokenizer's BOS id so doc-separator
@@ -755,19 +817,25 @@ class PostSemClawModel(nn.Module):
         # HYDRA_HTM_SUBSAMPLE=N (default 8). Set =1 for every-microbatch HTM.
         _htm_sub = int(os.environ.get("HYDRA_HTM_SUBSAMPLE", "8"))
         if not hasattr(self, '_htm_call_idx'):
-            self._htm_call_idx = 0
         _run_htm = (self._htm_call_idx % _htm_sub == 0)
         self._htm_call_idx += 1
         if _run_htm:
-            htm_handle = self.htm.forward_async(sdr_binary)
         else:
             htm_handle = None
         if _profile: _t_htm_async = _ev()
         dense_emb = self.wte(idx)  # (B, T, d_model) bf16
         if _profile: _t_wte = _ev()
@@ -804,10 +872,19 @@ class PostSemClawModel(nn.Module):
             and htm_cache_matches(self._htm_cache_key, sdr_binary.nonzero())
         ):
             htm_out = self._htm_cache
         else:
             # Very first call with subsample > 1, OR MDLM is on, OR the SDR
             # pattern has changed from the cached one under exact mode: run HTM.
-            htm_handle = self.htm.forward_async(sdr_binary)
             htm_out = self.htm.forward_await(htm_handle)
             self._htm_cache = htm_out.detach()
             self._htm_cache_key = htm_cache_key(sdr_binary.nonzero())
@@ -880,7 +957,18 @@ class PostSemClawModel(nn.Module):
                 # tensor of shape (n_streams, B, T, d_model) — see
                 # subsystems/mhc_mini.ManifoldHyperConnection.
                 x_mid = mhc_layer.merge_streams(streams)
-                x_after_engram, hit_rate = self.engram(x_mid, idx)
                 if os.environ.get("HYDRA_ENGRAM_RESET_STREAMS", "0") == "1":
                     streams = mhc_layer.init_streams(x_after_engram)
                 else:

 from hydra.engram import GPUEngram
 from hydra.htm_cache import htm_cache_key, htm_cache_matches
 from hydra.hyena_block import HyenaBlock
+from hydra.reality_bridge import RealityPoincareBridge
 # GDNBlock is imported lazily inside __init__ so the `fla` dependency is
 # only required when HYDRA_GDN_LAYERS is actually non-empty. Baseline
 # pure-Mamba3 runs continue to work without flash-linear-attention installed.
 from hydra.optimizer import MuonAdamW
 from hydra.sampled_softmax import UnigramSampler, sampled_softmax_loss
+try:
+    from subsystems.cantor_router import CantorRouter
+except ModuleNotFoundError:
+    from archive.cantor_router import CantorRouter
 def norm(x: torch.Tensor) -> torch.Tensor:
     """RMSNorm over the last dim — stateless, autocast-friendly."""
     return F.rms_norm(x, (x.size(-1),))
+def paired_slow_fast_orthogonality(w: torch.Tensor) -> torch.Tensor:
+    """Penalty for aligned adjacent slow/fast vector pairs."""
+    n = (w.shape[0] // 2) * 2
+    if n == 0:
+        return w.new_zeros(())
+    slow = F.normalize(w[:n:2].float(), dim=-1, eps=1e-8)
+    fast = F.normalize(w[1:n:2].float(), dim=-1, eps=1e-8)
+    return (slow * fast).sum(dim=-1).square().mean().to(dtype=w.dtype)
+def semantic_gaussian_mollify(
+    x: torch.Tensor,
+    std: float = 0.0,
+    training: bool = True,
+    eval_enabled: bool = False,
+) -> torch.Tensor:
+    """Optionally add train-time semantic Gaussian noise; disabled is identity."""
+    if std <= 0.0 or (not training and not eval_enabled):
+        return x
+    return x + torch.randn_like(x) * float(std)
+class _LocalMamba3Fallback(nn.Identity):
+    """Shape-preserving local fallback used only when mamba_ssm is absent."""
+    pass
 class PostSemClawModel(nn.Module):
     """Full Post-SEM-Claw model assembly.
                     n_heads=config.n_heads,
                 )
             if Mamba3 is None:
+                return _LocalMamba3Fallback()
             block = Mamba3(
                 d_model=config.d_model,
                 d_state=config.d_state,
             n_columns=config.engram_n_columns,
             max_ngram=3,
         )
+        self.reality_bridge = None
+        self.cantor = None
+        if os.environ.get("HYDRA_REALITY_BRIDGE", "0") == "1":
+            d_reality = int(os.environ.get("HYDRA_REALITY_D", "133"))
+            self.reality_bridge = RealityPoincareBridge(
+                d_model=config.d_model,
+                d_reality=d_reality,
+                l0_k=int(os.environ.get("HYDRA_REALITY_L0_K", "64")),
+            )
+            if os.environ.get("HYDRA_CANTOR_DISABLE", "0") != "1":
+                self.cantor = CantorRouter(
+                    depth=int(os.environ.get("HYDRA_CANTOR_DEPTH", "7")),
+                    d_query=d_reality,
+                    seed=int(os.environ.get("HYDRA_CANTOR_SEED", "42")),
+                    device=self.wte.weight.device,
+                )
         self.engram_layer_idx = config.engram_layer_idx
         # Manifold-Constrained Hyper-Connections (one per Mamba-3 block).
         nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
+        if hasattr(self.engram, "memory"):
+            nn.init.normal_(self.engram.memory, mean=0.0, std=0.01)
+        if hasattr(self.engram, "gate"):
+            nn.init.zeros_(self.engram.gate.weight)
+            nn.init.zeros_(self.engram.gate.bias)
+        if self.reality_bridge is not None:
+            nn.init.normal_(self.reality_bridge.to_reality.weight, mean=0.0, std=0.02)
+            nn.init.normal_(self.reality_bridge.to_tangent2.weight, mean=0.0, std=0.02)
+        if self.cantor is not None and hasattr(self.cantor, "branch"):
+            bound = (3.0 / float(self.cantor.d_query)) ** 0.5
+            nn.init.uniform_(self.cantor.branch, -bound, bound)
         # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
         # dtypes in the same shape group would break lerp_ dtype checks.
         self.wte.to(dtype=torch.bfloat16)
         self.blocks.to(dtype=torch.bfloat16)
         self.htm_proj.to(dtype=torch.bfloat16)
         self.engram.to(dtype=torch.bfloat16)
+        if self.reality_bridge is not None:
+            self.reality_bridge.to(dtype=torch.bfloat16)
+        if self.cantor is not None:
+            self.cantor.to(dtype=torch.bfloat16)
     def set_bos_token_id(self, bos_id: int) -> None:
         """Inform the model of the tokenizer's BOS id so doc-separator
         # HYDRA_HTM_SUBSAMPLE=N (default 8). Set =1 for every-microbatch HTM.
         _htm_sub = int(os.environ.get("HYDRA_HTM_SUBSAMPLE", "8"))
         if not hasattr(self, '_htm_call_idx'):
+            self._htm_call_idx = int(os.environ.get("HYDRA_HTM_INITIAL_OFFSET", "0"))
         _run_htm = (self._htm_call_idx % _htm_sub == 0)
         self._htm_call_idx += 1
         if _run_htm:
+            htm_handle = self.htm.forward_async(sdr_binary, output_dtype=self.wte.weight.dtype)
         else:
             htm_handle = None
         if _profile: _t_htm_async = _ev()
         dense_emb = self.wte(idx)  # (B, T, d_model) bf16
+        dense_emb = semantic_gaussian_mollify(
+            dense_emb,
+            std=float(os.environ.get("HYDRA_SEMANTIC_SMOOTH_STD", "0.0")),
+            training=self.training,
+            eval_enabled=os.environ.get("HYDRA_SEMANTIC_SMOOTH_EVAL", "0") == "1",
+        )
         if _profile: _t_wte = _ev()
             and htm_cache_matches(self._htm_cache_key, sdr_binary.nonzero())
         ):
             htm_out = self._htm_cache
+        elif (
+            os.environ.get("HYDRA_HTM_ZERO_CACHE_ON_MISS", "0") == "1"
+            and self.training
+            and not self._mdlm_active
+        ):
+            htm_out = torch.zeros((B, T, self.config.htm_n_columns + 1), device=dense_emb.device, dtype=dense_emb.dtype)
+            self._htm_cache = htm_out.detach()
+            self._htm_cache_key = None
+            self._htm_cache_shape = (B, T)
         else:
             # Very first call with subsample > 1, OR MDLM is on, OR the SDR
             # pattern has changed from the cached one under exact mode: run HTM.
+            htm_handle = self.htm.forward_async(sdr_binary, output_dtype=self.wte.weight.dtype)
             htm_out = self.htm.forward_await(htm_handle)
             self._htm_cache = htm_out.detach()
             self._htm_cache_key = htm_cache_key(sdr_binary.nonzero())
                 # tensor of shape (n_streams, B, T, d_model) — see
                 # subsystems/mhc_mini.ManifoldHyperConnection.
                 x_mid = mhc_layer.merge_streams(streams)
+                if self.reality_bridge is not None and self.cantor is not None:
+                    rb = self.reality_bridge(x_mid)
+                    cantor_leaf_ids, _ = self.cantor(rb.reality, return_scores=False)
+                    x_after_engram, hit_rate = self.engram(
+                        x_mid,
+                        idx,
+                        sdr_active_indices=rb.l0_indices,
+                        cantor_leaf_ids=cantor_leaf_ids,
+                        cantor_n_leaves=self.cantor.n_leaves,
+                    )
+                else:
+                    x_after_engram, hit_rate = self.engram(x_mid, idx)
                 if os.environ.get("HYDRA_ENGRAM_RESET_STREAMS", "0") == "1":
                     streams = mhc_layer.init_streams(x_after_engram)
                 else:

overlay/hydra/optimizer.py CHANGED Viewed

@@ -144,62 +144,117 @@ class MuonAdamW(torch.optim.Optimizer):
         self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
         self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
         self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
     def _step_adamw(self, group):
-        params, grads, exp_avgs, exp_avg_sqs, state_steps = [], [], [], [], []
         for p in group['params']:
             if p.grad is None:
                 continue
-            state = self.state[p]
-            if not state:
-                state['step'] = 0
-                state['exp_avg'] = torch.zeros_like(p)
-                state['exp_avg_sq'] = torch.zeros_like(p)
-            if 'step_t' not in state:
-                # _fused_adamw_ wants a per-param float step tensor on-device.
-                state['step_t'] = torch.tensor(
-                    float(state['step']), dtype=torch.float32, device=p.device
-                )
             state['step'] += 1
             params.append(p)
             grads.append(p.grad.to(p.dtype) if p.grad.dtype != p.dtype else p.grad)
             exp_avgs.append(state['exp_avg'])
             exp_avg_sqs.append(state['exp_avg_sq'])
-            state_steps.append(state['step_t'])
         if not params:
             return
-        if _HYDRA_FUSED_ADAMW and _HAS_FUSED_ADAMW and params[0].is_cuda:
-            # _fused_adamw_ needs uniform (device, dtype) within a call, so
-            # group by (device, dtype) — same pattern as PyTorch's own
-            # AdamW(fused=True) path (_group_tensors_by_device_and_dtype).
-            buckets = {}
-            for p, g, ea, es, st in zip(params, grads, exp_avgs, exp_avg_sqs, state_steps):
-                key = (p.device, p.dtype)
-                buckets.setdefault(key, ([], [], [], [], []))
-                b_p, b_g, b_ea, b_es, b_st = buckets[key]
-                b_p.append(p); b_g.append(g); b_ea.append(ea); b_es.append(es); b_st.append(st)
-            lr_f = float(group['lr'])
-            b1_f = float(group['betas'][0])
-            b2_f = float(group['betas'][1])
-            wd_f = float(group['weight_decay'])
-            eps_f = float(group['eps'])
-            for (_dev, _dt), (b_p, b_g, b_ea, b_es, b_st) in buckets.items():
-                torch._foreach_add_(b_st, 1.0)
-                torch._fused_adamw_(
-                    b_p, b_g, b_ea, b_es,
-                    [],  # max_exp_avg_sqs unused (amsgrad=False)
-                    b_st,
-                    amsgrad=False,
-                    lr=lr_f, beta1=b1_f, beta2=b2_f,
-                    weight_decay=wd_f, eps=eps_f,
-                    maximize=False,
-                    grad_scale=None, found_inf=None,
-                )
-            return
         # Fallback per-param path.
         self._adamw_lr_t.fill_(group['lr'])
         self._adamw_beta1_t.fill_(group['betas'][0])
@@ -213,15 +268,34 @@ class MuonAdamW(torch.optim.Optimizer):
                              self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t)
     def _step_muon(self, group):
-        params = [p for p in group['params'] if p.grad is not None]
         if not params:
             return
         p = params[0]
         state = self.state[p]
         num_params = len(params)
         shape, device, dtype = p.shape, p.device, p.dtype
-        if "momentum_buffer" not in state:
             state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device)
         red_dim = -1 if shape[-2] >= shape[-1] else -2
         if "second_momentum_buffer" not in state:
             # Shape must match v_mean = stacked_grads.square().mean(dim=red_dim, keepdim=True)

         self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
         self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
         self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_bucket_caches = {}
+        self._muon_params_caches = {}
+    def state_dict(self):
+        sd = super().state_dict()
+        # Transient fused-step caches and device step_t tensors must not enter
+        # checkpoints. step_t is recreated from scalar state['step'] lazily.
+        for st in sd.get("state", {}).values():
+            st.pop("step_t", None)
+        for group in sd.get("param_groups", []):
+            group.pop("_adamw_bucket_cache", None)
+            group.pop("_muon_params_cache", None)
+        return sd
+    def load_state_dict(self, state_dict):
+        for st in state_dict.get("state", {}).values():
+            st.pop("step_t", None)
+        for group in state_dict.get("param_groups", []):
+            group.pop("_adamw_bucket_cache", None)
+            group.pop("_muon_params_cache", None)
+        self._adamw_bucket_caches.clear()
+        self._muon_params_caches.clear()
+        return super().load_state_dict(state_dict)
+    def _ensure_adamw_state(self, p):
+        state = self.state[p]
+        if not state:
+            state['step'] = 0
+            state['exp_avg'] = torch.zeros_like(p)
+            state['exp_avg_sq'] = torch.zeros_like(p)
+        if 'step_t' not in state:
+            # _fused_adamw_ wants a per-param float step tensor on-device.
+            state['step_t'] = torch.tensor(
+                float(state['step']), dtype=torch.float32, device=p.device
+            )
+        return state
+    def _adamw_cached_buckets(self, group):
+        """Return stable (device,dtype) param buckets for fused AdamW.
+        Cache topology only. Optimizer state remains lazy for grad-bearing
+        params so unused/frozen tensors do not bloat checkpoints.
+        """
+        params_tuple = tuple(group['params'])
+        cache = self._adamw_bucket_caches.get(id(group))
+        if cache is not None and cache.get('params_tuple') == params_tuple:
+            return cache['buckets']
+        buckets = {}
+        for p in params_tuple:
+            key = (p.device, p.dtype)
+            buckets.setdefault(key, {'params': []})
+            buckets[key]['params'].append(p)
+        self._adamw_bucket_caches[id(group)] = {'params_tuple': params_tuple, 'buckets': buckets}
+        return buckets
     def _step_adamw(self, group):
+        if _HYDRA_FUSED_ADAMW and _HAS_FUSED_ADAMW:
+            # Mixed CPU/CUDA groups are unusual in Feather but skipping CPU
+            # grads would be a correctness bug; disable fused path in that case.
+            if not any(p.grad is not None and not p.is_cuda for p in group['params']):
+                buckets = self._adamw_cached_buckets(group)
+                lr_f = float(group['lr'])
+                b1_f = float(group['betas'][0])
+                b2_f = float(group['betas'][1])
+                wd_f = float(group['weight_decay'])
+                eps_f = float(group['eps'])
+                launched = False
+                for (_dev, _dt), bucket in buckets.items():
+                    b_p = [p for p in bucket['params'] if p.grad is not None]
+                    if not b_p or not b_p[0].is_cuda:
+                        continue
+                    b_g = [p.grad.to(p.dtype) if p.grad.dtype != p.dtype else p.grad for p in b_p]
+                    b_ea, b_es, b_st = [], [], []
+                    for p in b_p:
+                        state = self._ensure_adamw_state(p)
+                        state['step'] += 1
+                        b_ea.append(state['exp_avg'])
+                        b_es.append(state['exp_avg_sq'])
+                        b_st.append(state['step_t'])
+                    torch._foreach_add_(b_st, 1.0)
+                    torch._fused_adamw_(
+                        b_p, b_g, b_ea, b_es,
+                        [],  # max_exp_avg_sqs unused (amsgrad=False)
+                        b_st,
+                        amsgrad=False,
+                        lr=lr_f, beta1=b1_f, beta2=b2_f,
+                        weight_decay=wd_f, eps=eps_f,
+                        maximize=False,
+                        grad_scale=None, found_inf=None,
+                    )
+                    launched = True
+                if launched:
+                    return
+        params, grads, exp_avgs, exp_avg_sqs = [], [], [], []
         for p in group['params']:
             if p.grad is None:
                 continue
+            state = self._ensure_adamw_state(p)
             state['step'] += 1
+            if 'step_t' in state:
+                state['step_t'].fill_(float(state['step']))
             params.append(p)
             grads.append(p.grad.to(p.dtype) if p.grad.dtype != p.dtype else p.grad)
             exp_avgs.append(state['exp_avg'])
             exp_avg_sqs.append(state['exp_avg_sq'])
         if not params:
             return
         # Fallback per-param path.
         self._adamw_lr_t.fill_(group['lr'])
         self._adamw_beta1_t.fill_(group['betas'][0])
                              self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t)
     def _step_muon(self, group):
+        params_tuple = tuple(group['params'])
+        cache = self._muon_params_caches.get(id(group))
+        if cache is None or cache.get('params_tuple') != params_tuple:
+            cache = {'params_tuple': params_tuple, 'params': list(params_tuple)}
+            self._muon_params_caches[id(group)] = cache
+        params_all = cache['params']
+        # Common Feather path: all Muon matrix params receive grads every step.
+        # Preserve sparse/None-grad correctness by filtering only when needed.
+        if all(p.grad is not None for p in params_all):
+            params = params_all
+        else:
+            params = [p for p in params_all if p.grad is not None]
         if not params:
             return
         p = params[0]
         state = self.state[p]
         num_params = len(params)
         shape, device, dtype = p.shape, p.device, p.dtype
+        if (
+            "momentum_buffer" not in state
+            or state["momentum_buffer"].shape[0] != num_params
+            or tuple(state["momentum_buffer"].shape[1:]) != tuple(shape)
+        ):
+            # If grad-bearing Muon params change (rare; usually all matrix params
+            # have grads), resize instead of crashing compiled Muon on a stale
+            # leading dimension. This preserves skip-None-grad semantics.
             state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device)
+            state.pop("second_momentum_buffer", None)
         red_dim = -1 if shape[-2] >= shape[-1] else -2
         if "second_momentum_buffer" not in state:
             # Shape must match v_mean = stacked_grads.square().mean(dim=red_dim, keepdim=True)

overlay/hydra/training.py CHANGED Viewed

@@ -9,7 +9,7 @@ import os
 import sys
 import threading
 import time
-from dataclasses import asdict
 from pathlib import Path
 import torch
@@ -103,6 +103,22 @@ _CONTRASTIVE_CTX_LEN  = int(os.environ.get("HYDRA_CONTRASTIVE_CTX_LEN",  "8"))
 _CONTRASTIVE_N_PAIRS  = int(os.environ.get("HYDRA_CONTRASTIVE_N_PAIRS",  "256"))
 # ---------------------------------------------------------------------------
 # Schedules
 # ---------------------------------------------------------------------------
@@ -136,6 +152,7 @@ def save_ckpt(
     *,
     val_bpb: float | None = None,
 ) -> None:
     try:
         CACHE_DIR.mkdir(parents=True, exist_ok=True)
         payload = {
@@ -289,7 +306,22 @@ def maybe_resume_ckpt(
 def main() -> None:
     t_start = time.time()
     torch.manual_seed(SEED)
-    torch.cuda.manual_seed(SEED)
     # Precision / kernel-selection knobs for peak throughput on Ampere.
     # - high : matmul uses TF32 (Ampere's 10-bit mantissa accum) for fp32 ops
     # - allow_tf32 : explicit for both matmul + cudnn paths
@@ -299,12 +331,6 @@ def main() -> None:
     #   over the first ~100 steps. Observed 2026-04-22 and confirmed by
     #   differential profiling. Default is now FALSE; set =1 only if you
     #   see a specific workload where benchmark helps sustained tps.
-    torch.set_float32_matmul_precision("high")
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    torch.backends.cudnn.benchmark = os.environ.get("HYDRA_CUDNN_BENCHMARK", "0") == "1"
-    device = torch.device("cuda")
-    autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
     # Streaming path skips prepare.py (which normally trains the tokenizer
     # and builds the retina), so we must materialize both before model init.
@@ -435,7 +461,7 @@ def main() -> None:
         )
     _train_phase("dataloader_prefetch_start")
     train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, _current_seq_len, "train")
-    if step > 0 and os.environ.get("HYDRA_RESUME_SKIP_DATALOADER", "1") == "1":
         _skip_micro_batches = step * grad_accum_steps
         print(f"[resume] fast-forwarding train stream micro_batches={_skip_micro_batches} step={step} grad_accum={grad_accum_steps}", flush=True)
         for _skip_i in range(_skip_micro_batches):
@@ -469,13 +495,11 @@ def main() -> None:
     _ASYNC_POSTPROCESS = os.environ.get("HYDRA_ASYNC_POSTPROCESS", "1") == "1"
     _som_thread: threading.Thread | None = None
     _hestia_thread: threading.Thread | None = None
-    _hestia_stream: torch.cuda.Stream | None = (
-        torch.cuda.Stream() if _ASYNC_POSTPROCESS else None
-    )
     # Hebbian retina mode — per-step on-GPU update, mutually exclusive with SOM.
     # Activated by env HYDRA_HEBBIAN_RETINA=1 (default off).
-    _HEBBIAN_RETINA = os.environ.get("HYDRA_HEBBIAN_RETINA", "0") == "1"
     _HEBBIAN_ALPHA = float(os.environ.get("HYDRA_HEBBIAN_ALPHA", "0.001"))
     _prof = os.environ.get("HYDRA_PROFILE_FORWARD", "0") == "1"
     if _HEBBIAN_RETINA:
@@ -514,6 +538,32 @@ def main() -> None:
     # default cadence) instead of every step.
     nan_flag = torch.zeros((), device=device, dtype=torch.bool)
     _first_step_marker_emitted = False
     while True:
         if not _first_step_marker_emitted:
@@ -608,18 +658,9 @@ def main() -> None:
         # A10G Hyena fallback can produce finite forward loss but non-finite
         # gradients through the guarded residual path on the next optimizer
-        # step. Scrub non-finite grad entries before clipping/stepping so one
-        # bad native-kernel backward value cannot poison the entire parameter
-        # state and create step=1 train_loss=nan.
-        # Fast GPU-native grad guard
-        if os.environ.get("HYDRA_GRAD_FINITE_GUARD", "1") == "1":
-            with torch.no_grad():
-                for p in model.parameters():
-                    if p.grad is not None:
-                        p.grad.nan_to_num_(nan=0.0, posinf=0.0, neginf=0.0)
-        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
-        optimizer.step()
         if _prof:
             torch.cuda.synchronize(); _t_opt = time.time()

 import sys
 import threading
 import time
+from dataclasses import asdict, fields
 from pathlib import Path
 import torch
 _CONTRASTIVE_N_PAIRS  = int(os.environ.get("HYDRA_CONTRASTIVE_N_PAIRS",  "256"))
+def config_from_dict(payload: dict) -> PostSemClawConfig:
+    """Rebuild PostSemClawConfig from a checkpoint payload dict.
+    Checkpoints can contain older configs without newer dataclass fields, or
+    future configs with unknown fields.  Keep loading permissive, but normalize
+    tuple-backed topology fields so Hyena/GDN layer selections survive JSON or
+    pickle paths that turn tuples into lists.
+    """
+    field_names = {field.name for field in fields(PostSemClawConfig)}
+    kwargs = {key: value for key, value in payload.items() if key in field_names}
+    for tuple_key in ("hyena_layers", "gdn_layers"):
+        if tuple_key in kwargs and kwargs[tuple_key] is not None:
+            kwargs[tuple_key] = tuple(kwargs[tuple_key])
+    return PostSemClawConfig(**kwargs)
 # ---------------------------------------------------------------------------
 # Schedules
 # ---------------------------------------------------------------------------
     *,
     val_bpb: float | None = None,
 ) -> None:
+    global _CKPT_WORKER_THREAD
     try:
         CACHE_DIR.mkdir(parents=True, exist_ok=True)
         payload = {
 def main() -> None:
     t_start = time.time()
     torch.manual_seed(SEED)
+    device_str = "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device_str)
+    if device_str == "cuda":
+        torch.cuda.manual_seed(SEED)
+        torch.set_float32_matmul_precision("high")
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cudnn.benchmark = os.environ.get("HYDRA_CUDNN_BENCHMARK", "0") == "1"
+        autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
+    else:
+        # CPU path: limit BLAS threads to avoid oversubscription with data workers.
+        _cpu_threads = int(os.environ.get("HYDRA_CPU_THREADS", str(min(os.cpu_count() or 4, 8))))
+        torch.set_num_threads(_cpu_threads)
+        print(f"[CPU] torch.set_num_threads={_cpu_threads}")
+        autocast_ctx = torch.amp.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=False)
     # Precision / kernel-selection knobs for peak throughput on Ampere.
     # - high : matmul uses TF32 (Ampere's 10-bit mantissa accum) for fp32 ops
     # - allow_tf32 : explicit for both matmul + cudnn paths
     #   over the first ~100 steps. Observed 2026-04-22 and confirmed by
     #   differential profiling. Default is now FALSE; set =1 only if you
     #   see a specific workload where benchmark helps sustained tps.
     # Streaming path skips prepare.py (which normally trains the tokenizer
     # and builds the retina), so we must materialize both before model init.
         )
     _train_phase("dataloader_prefetch_start")
     train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, _current_seq_len, "train")
+    if step > 0 and os.environ.get("HYDRA_RESUME_SKIP_DATALOADER", "1") != "1":
         _skip_micro_batches = step * grad_accum_steps
         print(f"[resume] fast-forwarding train stream micro_batches={_skip_micro_batches} step={step} grad_accum={grad_accum_steps}", flush=True)
         for _skip_i in range(_skip_micro_batches):
     _ASYNC_POSTPROCESS = os.environ.get("HYDRA_ASYNC_POSTPROCESS", "1") == "1"
     _som_thread: threading.Thread | None = None
     _hestia_thread: threading.Thread | None = None
+    _hestia_stream = torch.cuda.Stream() if (_ASYNC_POSTPROCESS and device.type == "cuda") else None
     # Hebbian retina mode — per-step on-GPU update, mutually exclusive with SOM.
     # Activated by env HYDRA_HEBBIAN_RETINA=1 (default off).
+    _HEBBIAN_RETINA = device.type == "cuda" and os.environ.get("HYDRA_HEBBIAN_RETINA", "0") == "1"
     _HEBBIAN_ALPHA = float(os.environ.get("HYDRA_HEBBIAN_ALPHA", "0.001"))
     _prof = os.environ.get("HYDRA_PROFILE_FORWARD", "0") == "1"
     if _HEBBIAN_RETINA:
     # default cadence) instead of every step.
     nan_flag = torch.zeros((), device=device, dtype=torch.bool)
+    # Device-step fusion surface: cache the parameter walk once and keep the
+    # finite-grad guard + clipping + optimizer launch in one compact boundary.
+    # This avoids re-materializing model.parameters() twice per optimizer step
+    # and gives the A10G path a single toggleable fused-step block without
+    # pulling dataloader/checkpoint/logging CPU control flow into Dynamo.
+    _HYDRA_FUSED_DEVICE_STEP = os.environ.get("HYDRA_FUSED_DEVICE_STEP", "1") == "1"
+    _trainable_params = tuple(model.parameters())
+    def _finish_device_step():
+        if _HYDRA_FUSED_DEVICE_STEP:
+            if os.environ.get("HYDRA_GRAD_FINITE_GUARD", "1") == "1":
+                with torch.no_grad():
+                    for _p in _trainable_params:
+                        if _p.grad is not None:
+                            _p.grad.nan_to_num_(nan=0.0, posinf=0.0, neginf=0.0)
+            torch.nn.utils.clip_grad_norm_(_trainable_params, max_norm=1.0)
+            optimizer.step()
+            return
+        if os.environ.get("HYDRA_GRAD_FINITE_GUARD", "1") == "1":
+            with torch.no_grad():
+                for _p in model.parameters():
+                    if _p.grad is not None:
+                        _p.grad.nan_to_num_(nan=0.0, posinf=0.0, neginf=0.0)
+        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+        optimizer.step()
     _first_step_marker_emitted = False
     while True:
         if not _first_step_marker_emitted:
         # A10G Hyena fallback can produce finite forward loss but non-finite
         # gradients through the guarded residual path on the next optimizer
+        # step. The fused device-step boundary scrubs, clips, and launches the
+        # optimizer without re-walking model.parameters() on every substage.
+        _finish_device_step()
         if _prof:
             torch.cuda.synchronize(); _t_opt = time.time()

overlay/kernels/__init__.py ADDED Viewed

File without changes

overlay/kernels/cuda/decode_kernels.cu ADDED Viewed

	@@ -0,0 +1,10 @@

+/*
+ * CuTe DSL decode kernels for Mamba-3 autoregressive generation.
+ *
+ * Phase 2: Optimized single-token SSM step for inference.
+ * Phase 1: Not needed (training only, no generation).
+ *
+ * Fuses: input_proj + conv_step + ssm_step + output_proj
+ * into a single kernel launch for minimal latency.
+ */
+// Stub: Phase 2 implementation

overlay/kernels/cuda/flashfftconv/LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

overlay/kernels/cuda/flashfftconv/README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+# flashfftconv (vendored)
+Vendored from https://github.com/HazyResearch/flash-fft-conv (Apache 2.0 license).
+**Upstream commit:** see `UPSTREAM_COMMIT`.
+## What this is
+HazyResearch's Monarch-matrix-decomposition FFT convolution CUDA kernel. Provides a
+drop-in replacement for `torch.fft.rfft + complex-mult + irfft` that runs ~2-3x
+faster than cuFFT for the specific power-of-two lengths it supports (256, 512,
+1024, 2048, 4096, 8192, ..., up to 4M).
+In HYDRA, we use it to accelerate `subsystems/hyena_pure.fftconv_ref`. The
+accelerated path is opt-in via `HYDRA_HYENA_FLASH_FFT=1`; default behavior is
+unchanged (pure PyTorch fallback).
+## How to build
+The vendored tree contains:
+- `flashfftconv/` — pure-Python wrappers (imports `monarch_cuda` CUDA extension)
+- `csrc/` — CUDA source files and setup.py for the native extension
+Build instructions:
+```bash
+cd /home/mikeb/work/feather/kernels/cuda/flashfftconv/csrc
+# Edit `csrc/setup.py` first: change the cc_flag line to match your GPU arch
+# (RTX 3060 = 8.6, A100 = 8.0, H100 = 9.0). Example for RTX 3060:
+#   cc_flag = ['--generate-code=arch=compute_86,code=compute_86']
+# Build with the local CUDA toolchain (must match your torch.version.cuda):
+CUDA_HOME=/usr/local/cuda-12.1 .venv/bin/pip install -e .
+```
+Then install the Python wrappers:
+```bash
+cd /home/mikeb/work/feather/kernels/cuda/flashfftconv
+.venv/bin/pip install -e .
+```
+## Runtime usage
+Once installed, set `HYDRA_HYENA_FLASH_FFT=1` and training will use it.
+`subsystems/hyena_pure.fftconv_ref` auto-detects via `try: import flashfftconv`
+and falls back to pure PyTorch on import failure.
+## Known caveats
+- Seqlen must be a power of 2 AND in the supported set: {256, 512, 1024, 2048,
+  4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304}.
+  For HYDRA, `fft_size = 2 * seq_len` → seq_len in {128, 256, 512, 1024, 2048, ...}.
+- dtype must be fp16 or bf16 (fp32 not supported).
+- GPU arch must be compiled into the extension (see setup.py cc_flag).
+- CUDA toolchain major.minor should match `torch.version.cuda` major (12.x ↔ 12.x).

overlay/kernels/cuda/flashfftconv/UPSTREAM_COMMIT ADDED Viewed

	@@ -0,0 +1 @@


1	+ b8771028717f46d5b22cbb8e12833f35033d621b

overlay/kernels/cuda/flashfftconv/csrc/.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+*.npy
+*.json
+*.png
+*/*.npy
+*/*.json
+*/*.png
+*.DS_Store
+*/*.DS_Store

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly.h ADDED Viewed

	@@ -0,0 +1,374 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_IS_HALF_OR_BFLOAT(x) TORCH_CHECK(x.dtype() == torch::kFloat16 || x.dtype() == torch::kBFloat16, #x " must be float16 or bfloat16")
+#define CHECK_INPUT(x) \
+    CHECK_CUDA(x);     \
+    CHECK_CONTIGUOUS(x); \
+    CHECK_IS_HALF_OR_BFLOAT(x)
+#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
+std::vector<torch::Tensor> butterfly_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt
+);
+std::vector<torch::Tensor> butterfly_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt
+);
+std::vector<torch::Tensor> butterfly_padded_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    std::optional<at::Tensor> x_gate = std::nullopt
+);
+std::vector<torch::Tensor> butterfly_padded_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    std::optional<at::Tensor> x_gate = std::nullopt
+);
+torch::Tensor butterfly_ifft_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt
+);
+torch::Tensor butterfly_ifft_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt
+);
+torch::Tensor butterfly_ifft_padded_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    std::optional<at::Tensor> out_gate = std::nullopt
+);
+torch::Tensor butterfly_ifft_padded_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    std::optional<at::Tensor> out_gate = std::nullopt
+);
+std::vector<torch::Tensor> butterfly(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag);
+}
+std::vector<torch::Tensor> butterfly_gated(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(x_gate);
+    return butterfly_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, x_gate);
+}
+std::vector<torch::Tensor> butterfly_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    return butterfly_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag);
+}
+std::vector<torch::Tensor> butterfly_gated_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    CHECK_INPUT(x_gate);
+    return butterfly_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, x_gate);
+}
+torch::Tensor butterfly_ifft(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_cuda(x_real, x_imag, d_f_T, twiddle_factors_real, twiddle_factors_imag);
+}
+torch::Tensor butterfly_ifft_gated(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(out_gate);
+    return butterfly_ifft_cuda(x_real, x_imag, d_f_T, twiddle_factors_real, twiddle_factors_imag, out_gate);
+}
+torch::Tensor butterfly_ifft_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag);
+}
+torch::Tensor butterfly_ifft_gated_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(out_gate);
+    return butterfly_ifft_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, out_gate);
+}
+std::vector<torch::Tensor> butterfly_padded(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, M);
+}
+std::vector<torch::Tensor> butterfly_padded_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, M);
+}
+std::vector<torch::Tensor> butterfly_padded_gated(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, M, x_gate);
+}
+std::vector<torch::Tensor> butterfly_padded_gated_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, M, x_gate);
+}
+torch::Tensor butterfly_ifft_padded(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_cuda(x_real, x_imag, d_f, twiddle_factors_real, twiddle_factors_imag, N);
+}
+torch::Tensor butterfly_ifft_padded_gated(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_cuda(x_real, x_imag, d_f, twiddle_factors_real, twiddle_factors_imag, N, out_gate);
+}
+torch::Tensor butterfly_ifft_padded_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, N);
+}
+torch::Tensor butterfly_ifft_padded_gated_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, N, out_gate);
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda.cu ADDED Viewed

	@@ -0,0 +1,699 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_cuda_kernel_64(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ half x_shared[];
+    half *d_f_real = &x_shared[N * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __half2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[4][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[4];
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            if(x_gate != nullptr){
+                reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+            }else{
+                reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+            }
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+            out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_cuda_kernel_32(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ half x_shared[32 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    __shared__ half out_imag_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate == nullptr){
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
+        }else{
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        }
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], __float2half(0.0f));
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]));
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+    }
+}
+__global__ void butterfly_cuda_kernel_128(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * gridDim.x * 2 + blockIdx.z * 16 * 128 * 32 * gridDim.x * 2 + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ half shared_real[];
+    half *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[8][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 128 + threadIdx.x + j * blockDim.x;
+            shared_real[shared_offset] = d_f[shared_offset].real();
+            shared_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(shared_real)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__half2*>(shared_imag)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__half2*>(shared_real)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+                }else{
+                    reinterpret_cast<__half2*>(shared_real)[shared_offset] = x[offset + idx];
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            __half2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+                }
+                wmma::store_matrix_sync(shared_real + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+                wmma::store_matrix_sync(shared_imag + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = 0; i < n; i++)
+            {
+                for(int j=0; j< 2; j++){
+                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[offset + idx] = reinterpret_cast<__half2*>(shared_real)[shared_offset];
+                    out_imag[offset + idx] = reinterpret_cast<__half2*>(shared_imag)[shared_offset];
+                }
+            }
+            __syncthreads();
+    }
+}
+__global__ void butterfly_cuda_kernel_16(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ half x_shared[16 * 64];
+    __shared__ half d_f_real[16 * 16];
+    __shared__ half d_f_imag[16 * 16];
+    __shared__ half twiddles_real_shared[16 * 64];
+    __shared__ half twiddles_imag_shared[16 * 64];
+    __shared__ half out_real_shared[16 * 64];
+    __shared__ half out_imag_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate != NULL)
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        else
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){
+            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);
+        wmma::fill_fragment(acc_frag_imag, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)
+        {
+            tmp_real = reinterpret_cast<__half2 *>(acc_frag_real.x)[k];
+            tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<__half2 *>(acc_frag_real.x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]));
+            reinterpret_cast<__half2 *>(acc_frag_imag.x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]));
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+    }
+}
+std::vector<torch::Tensor> butterfly_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt)
+{
+    uint B = x.size(0);
+    uint H = x.size(1);
+    // uint m = x.size(1);
+    // const int TILE_SIZE = 16;
+    uint N = x.size(2);
+    uint M = x.size(3);
+    dim3 gridDim;
+    dim3 blockDim;
+    gridDim.y = B;
+    gridDim.z = H;
+    torch::Tensor out_real = torch::empty({B, H, N, M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, N, M}, x.options());
+    //set blockDims
+    switch(N){
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            break;
+        default:
+            blockDim.x = 32;
+            blockDim.y = 4;
+            break;
+    }
+    //set gridDim.x
+    switch(N){
+        case 128:
+            switch (M){
+                case 16384:
+                    gridDim.x = 128;
+                    break;
+                case 8192:
+                    gridDim.x = 64;
+                    break;
+                case 4096:
+                    gridDim.x = 32;
+                    break;
+                default:
+                    gridDim.x = 256;
+                    break;
+            }
+            break;
+        default:
+            switch (M){
+                case 16384:
+                    gridDim.x = 256;
+                    break;
+                case 8192:
+                    gridDim.x = 128;
+                    break;
+                case 4096:
+                    gridDim.x = 64;
+                    break;
+                default:
+                    gridDim.x = 512;
+                    break;
+            }
+            break;
+    }
+    switch (N)
+    {
+    case 16:
+        butterfly_cuda_kernel_16<<<gridDim, blockDim>>>(
+            static_cast<__half2 *>(x.data_ptr()),
+            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out_real.data_ptr()),
+            static_cast<__half2 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        butterfly_cuda_kernel_32<<<gridDim, blockDim>>>(
+            static_cast<__half2 *>(x.data_ptr()),
+            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out_real.data_ptr()),
+            static_cast<__half2 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        butterfly_cuda_kernel_64<<<gridDim, blockDim, 57344>>>(
+            static_cast<__half2 *>(x.data_ptr()),
+            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out_real.data_ptr()),
+            static_cast<__half2 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        butterfly_cuda_kernel_128<<<gridDim, blockDim, 65536>>>(
+            static_cast<__half2 *>(x.data_ptr()),
+            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out_real.data_ptr()),
+            static_cast<__half2 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    default:
+    printf("Not yet implemented \n");
+        break;
+    }
+    return {out_real, out_imag};
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda_bf16.cu ADDED Viewed

	@@ -0,0 +1,725 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 x_shared[];
+    __nv_bfloat16 *d_f_real_shared = &x_shared[N * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    float *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
+    }
+    float2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[4][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[4];
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            if(x_gate != nullptr){
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+            }else{
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], 0.0f);
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+            }
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+            out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ __nv_bfloat16 x_shared[32 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[32 * 32];
+    __shared__ __nv_bfloat16 d_f_imag_shared[32 * 32];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    __shared__ float out_imag_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate != nullptr){
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        }else{
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+        d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], 0.0f);
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                 for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {
+                    tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]);
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+    }
+}
+__global__ void butterfly_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 shared_real[];
+    __nv_bfloat16 *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[8][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162 *>(shared_real)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162 *>(shared_imag)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__nv_bfloat162*>(shared_imag)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+                }else{
+                    reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = x[offset + idx];
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], 0.0f);
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            float2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+                }
+            }
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = 0; i < n; i++)
+            {
+                for(int j=0; j< 2; j++){
+                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+                }
+            }
+            __syncthreads();
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = 0; i < n; i++)
+            {
+                for(int j=0; j< 2; j++){
+                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                    out_imag[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+                }
+            }
+    }
+}
+__global__ void butterfly_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ __nv_bfloat16 x_shared[16 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[16 * 16];
+    __shared__ __nv_bfloat16 d_f_imag_shared[16 * 16];
+    __shared__ __nv_bfloat16 twiddles_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[16 * 64];
+    __shared__ float out_real_shared[16 * 64];
+    __shared__ float out_imag_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate != nullptr){
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        }else{
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){
+            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
+            d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+            d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real_shared, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag_shared, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);
+        wmma::fill_fragment(acc_frag_imag, 0.0f);
+         wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);
+#pragma unroll
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)
+        {
+            tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real.x)[k];
+            tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<float2 *>(acc_frag_real.x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]);
+            reinterpret_cast<float2 *>(acc_frag_imag.x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]);
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+    }
+}
+std::vector<torch::Tensor> butterfly_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt
+    )
+{
+    uint B = x.size(0);
+    uint H = x.size(1);
+    // uint m = x.size(1);
+    // const int TILE_SIZE = 16;
+    uint N = x.size(2);
+    uint M = x.size(3);
+    dim3 gridDim;
+    dim3 blockDim;
+    gridDim.y = B;
+    gridDim.z = H;
+    torch::Tensor out_real = torch::empty({B, H, N, M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, N, M}, x.options());
+    //set blockDims
+    switch(N){
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            break;
+        default:
+            blockDim.x = 32;
+            blockDim.y = 4;
+            break;
+    }
+    //set gridDim.x
+    switch(N){
+        case 128:
+            switch (M){
+                case 16384:
+                    gridDim.x = 128;
+                    break;
+                case 8192:
+                    gridDim.x = 64;
+                    break;
+                case 4096:
+                    gridDim.x = 32;
+                    break;
+                default:
+                    gridDim.x = 256;
+                    break;
+            }
+            break;
+        default:
+            switch (M){
+                case 16384:
+                    gridDim.x = 256;
+                    break;
+                case 8192:
+                    gridDim.x = 128;
+                    break;
+                case 4096:
+                    gridDim.x = 64;
+                    break;
+                default:
+                    gridDim.x = 512;
+                    break;
+            }
+            break;
+    }
+    switch (N)
+    {
+    case 16:
+        butterfly_cuda_kernel_16<<<gridDim, blockDim>>>(
+            static_cast<__nv_bfloat162 *>(x.data_ptr()),
+            x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        butterfly_cuda_kernel_32<<<gridDim, blockDim>>>(
+            static_cast<__nv_bfloat162 *>(x.data_ptr()),
+            x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+        butterfly_cuda_kernel_64<<<gridDim, blockDim, 78000>>>(
+            static_cast<__nv_bfloat162 *>(x.data_ptr()),
+            x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        butterfly_cuda_kernel_128<<<gridDim, blockDim, 65536>>>(
+            static_cast<__nv_bfloat162 *>(x.data_ptr()),
+            x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+            B,
+            H,
+            N);
+        break;
+    default:
+    printf("Not yet implemented \n");
+        break;
+    }
+    return {out_real, out_imag};
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda.cu ADDED Viewed

	@@ -0,0 +1,723 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_ifft_cuda_kernel_64(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ half x_real_shared[];
+    half *x_imag_shared = &x_real_shared[N * N];
+    half *d_f_real = &x_imag_shared[N * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+            wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+            reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]);
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            if(out_gate != nullptr){
+                out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+            }
+            else{
+                out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_32(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ half x_real_shared[32 * 64];
+    __shared__ half x_imag_shared[32 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = __hneg(acc_frag_real[i][j].x[k]);
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){
+            out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+        }
+        else{
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_128(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+     const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = 8;
+    const int n = 16;
+    extern __shared__ half real_shared[];
+    half *imag_shared = &real_shared[128 * 128];
+    half *real_shared_2 = &imag_shared[128 * 128];
+    half *imag_shared_2 = &real_shared_2[128 * 128];
+    __half2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag[8][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 128 + threadIdx.x + j * blockDim.x;
+            real_shared_2[shared_offset] = d_f[shared_offset].real();
+            imag_shared_2[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__half2*>(imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__half2*>(real_shared)[shared_offset] = x_real[offset + idx];
+                reinterpret_cast<__half2*>(imag_shared)[shared_offset] = x_imag[offset + idx];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements/2; k++)
+            {
+                tmp_real = __hsub2(__hmul2(reinterpret_cast<__half2*>(tw_frag_real[j].x)[k], reinterpret_cast<__half2*>(b_frag_real[j].x)[k]),
+                 __hmul2(reinterpret_cast<__half2*>(tw_frag_imag[j].x)[k], reinterpret_cast<__half2*>(b_frag_imag[j].x)[k]));
+                tmp_imag = __hadd2(__hmul2(reinterpret_cast<__half2*>(tw_frag_real[j].x)[k], reinterpret_cast<__half2*>(b_frag_imag[j].x)[k]),
+                 __hmul2(reinterpret_cast<__half2*>(tw_frag_imag[j].x)[k], reinterpret_cast<__half2*>(b_frag_real[j].x)[k]));
+                reinterpret_cast<__half2*>(b_frag_real[j].x)[k] = tmp_real;
+                reinterpret_cast<__half2*>(b_frag_imag[j].x)[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]);
+            }
+        }
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(out_gate != nullptr){
+                    out_real[offset + idx] = __hmul2(reinterpret_cast<__half2*>(real_shared)[shared_offset], out_gate[offset + idx]);
+                }
+                else{
+                    out_real[offset + idx] = reinterpret_cast<__half2*>(real_shared)[shared_offset];
+                }
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_16(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+   const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ half x_real_shared[16 * 64];
+    __shared__ half x_imag_shared[16 * 64];
+    __shared__ half d_f_real[16 * 16];
+    __shared__ half d_f_imag[16 * 16];
+    __shared__ half twiddles_real_shared[16 * 64];
+    __shared__ half twiddles_imag_shared[16 * 64];
+    __shared__ half out_real_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        if(threadIdx.x  < 16 ){
+            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    //check if it is better to have one warp do all the multiplication or split between warps
+    if (threadIdx.y < 4)
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++)
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real);
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = __hneg(acc_frag_real.x[k]);
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){
+            out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+        }
+        else{
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+    }
+}
+torch::Tensor butterfly_ifft_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt)
+{
+    uint B = x_real.size(0);
+    uint H = x_real.size(1);
+    // uint m = x.size(1);
+    // const int TILE_SIZE = 16;
+    dim3 gridDim;
+    dim3 blockDim;
+    uint N = x_real.size(2);
+    uint M = x_real.size(3);
+    gridDim.y = B;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    torch::Tensor out = torch::empty({B, H, N, M}, x_real.options());
+    gridDim.z = H;
+    //set blockDims
+    switch(N){
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            break;
+        default:
+            blockDim.x = 32;
+            blockDim.y = 4;
+            break;
+    }
+    //set gridDim.x
+    switch(N){
+        case 128:
+            switch (M){
+                case 16384:
+                    gridDim.x = 128;
+                    break;
+                case 8192:
+                    gridDim.x = 64;
+                    break;
+                case 4096:
+                    gridDim.x = 32;
+                    break;
+                default:
+                    gridDim.x = 256;
+                    break;
+            }
+            break;
+        default:
+            switch (M){
+                case 16384:
+                    gridDim.x = 256;
+                    break;
+                case 8192:
+                    gridDim.x = 128;
+                    break;
+                case 4096:
+                    gridDim.x = 64;
+                    break;
+                default:
+                    gridDim.x = 512;
+                    break;
+            }
+            break;
+    }
+    switch (N)
+    {
+    case 16:
+        butterfly_ifft_cuda_kernel_16<<<gridDim, blockDim>>>(
+            static_cast<__half2 *>(x_real.data_ptr()),
+            static_cast<__half2 *>(x_imag.data_ptr()),
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out.data_ptr()),
+            out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        butterfly_ifft_cuda_kernel_32<<<gridDim, blockDim>>>(
+            static_cast<__half2 *>(x_real.data_ptr()),
+            static_cast<__half2 *>(x_imag.data_ptr()),
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out.data_ptr()),
+            out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_ifft_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        butterfly_ifft_cuda_kernel_64<<<gridDim, blockDim, 8 * N * N * sizeof(half)>>>(
+            static_cast<__half2 *>(x_real.data_ptr()),
+            static_cast<__half2 *>(x_imag.data_ptr()),
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out.data_ptr()),
+            out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_ifft_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536*2);
+        butterfly_ifft_cuda_kernel_128<<<gridDim, blockDim, 65536*2>>>(
+            static_cast<__half2 *>(x_real.data_ptr()),
+            static_cast<__half2 *>(x_imag.data_ptr()),
+            static_cast<complex_half_t *>(d_f.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__half2 *>(out.data_ptr()),
+            out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    default:
+        printf("Not implemented\n");
+    }
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda_bf16.cu ADDED Viewed

	@@ -0,0 +1,705 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cuda_runtime.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_ifft_bf16_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 x_real_shared[];
+    __nv_bfloat16 *x_imag_shared = &x_real_shared[N * N];
+    __nv_bfloat16 *d_f_real_shared = &x_imag_shared[N * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+            wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+            reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            if(out_gate != nullptr){
+                out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]); ;
+            }else{
+                out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_bf16_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ __nv_bfloat16 x_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        __nv_bfloat16 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = - acc_frag_real[i][j].x[k];
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){
+            out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]);
+        }else{
+            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        }
+    }
+}
+__global__ void butterfly_ifft_bf16_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 real_shared[];
+    __nv_bfloat16 *imag_shared = &real_shared[128 * 128];
+    __nv_bfloat16 *real_shared_2 = &imag_shared[128 * 128];
+    __nv_bfloat16 *imag_shared_2 = &real_shared_2[128 * 128];
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag[8][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared_2)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared_2)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = x_real[offset + idx];
+                reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = x_imag[offset + idx];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < 8; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 8; i++)
+        {
+            //wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(reinterpret_cast<float*>(real_shared) + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(out_gate != nullptr){
+                    out_real[offset + idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]), out_gate[offset + idx]);
+                }else{
+                    out_real[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]);
+                }
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_bf16_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    __shared__ __nv_bfloat16 x_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[16 * 64];
+    __shared__ float out_real_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        __nv_bfloat16 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++)
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real);
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = - acc_frag_real.x[k];
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){
+            out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]);
+        }else{
+            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        }
+    }
+}
+torch::Tensor butterfly_ifft_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt
+    )
+{
+    uint B = x_real.size(0);
+    uint H = x_real.size(1);
+    // uint m = x.size(1);
+    // const int TILE_SIZE = 16;
+    dim3 gridDim;
+    dim3 blockDim;
+    uint N = x_real.size(2);
+    uint M = x_real.size(3);
+    gridDim.y = B;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    torch::Tensor out = torch::empty({B, H, N, M}, x_real.options());
+    //set blockDims
+    switch(N){
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            break;
+        default:
+            blockDim.x = 32;
+            blockDim.y = 4;
+            break;
+    }
+    //set gridDim.x
+    switch(N){
+        case 128:
+            switch (M){
+                case 16384:
+                    gridDim.x = 128;
+                    break;
+                case 8192:
+                    gridDim.x = 64;
+                    break;
+                case 4096:
+                    gridDim.x = 32;
+                    break;
+                default:
+                    gridDim.x = 256;
+                    break;
+            }
+            break;
+        default:
+            switch (M){
+                case 16384:
+                    gridDim.x = 256;
+                    break;
+                case 8192:
+                    gridDim.x = 128;
+                    break;
+                case 4096:
+                    gridDim.x = 64;
+                    break;
+                default:
+                    gridDim.x = 512;
+                    break;
+            }
+            break;
+    }
+    switch (N)
+    {
+     case 16:
+        gridDim.z = H;
+        butterfly_ifft_bf16_cuda_kernel_16<<<gridDim, blockDim>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        gridDim.z = H;
+        butterfly_ifft_bf16_cuda_kernel_32<<<gridDim, blockDim>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_ifft_bf16_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+        butterfly_ifft_bf16_cuda_kernel_64<<<gridDim, blockDim, 78000>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        gridDim.z = H / 16;
+        cudaFuncSetAttribute(&butterfly_ifft_bf16_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+        butterfly_ifft_bf16_cuda_kernel_128<<<gridDim, blockDim, 65536 * 2>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+            B,
+            H,
+            N);
+        break;
+    default:
+        printf("Not implemented\n");
+    }
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_cuda.cu ADDED Viewed

	@@ -0,0 +1,871 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cmath>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+template <int K>
+__global__ void butterfly_padded_cuda_kernel_64(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset = blockIdx.y * H * M/2 + blockIdx.z * 16 *  M/2;
+    const int out_offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    const int N = 64;
+    extern __shared__ half x_shared[];
+    half *d_f_real = &x_shared[K * 16 * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 64 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __half2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[K][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[4];
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        t_offset = t * M/2;
+        out_t_offset = t * 64 * 32 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            if(i < K * 16){
+                idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+                shared_offset = i * 32 + threadIdx.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__half2 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset + t_offset], x_gate[idx + offset + t_offset]) : __floats2half2_rn(0.0f, 0.0f);
+                }
+                else{
+                    reinterpret_cast<__half2 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset + t_offset] : __floats2half2_rn(0.0f, 0.0f);
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+            for (int k = 0; k < K; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+            for (int k = 0; k < K; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+            }
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            out_real[out_offset + out_t_offset + idx] = reinterpret_cast<__half2 *>(out_real_shared)[shared_offset];
+            out_imag[out_offset + out_t_offset + idx] = reinterpret_cast<__half2 *>(out_imag_shared)[shared_offset];
+        }
+        __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_padded_cuda_kernel_128(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset = blockIdx.y * H * M/2 + blockIdx.z * 16 *  M/2;
+    const int out_offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x;
+    const int N = 128;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    extern __shared__ half shared_real[];
+    half *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[K][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[8];
+    for (int i = threadIdx.y ; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = i * 128 + threadIdx.x + j * blockDim.x;
+            shared_real[shared_offset] = d_f[shared_offset].real();
+            shared_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(shared_real)[shared_offset] = twiddle_factors_real[idx];
+            reinterpret_cast<__half2*>(shared_imag)[shared_offset] = twiddle_factors_imag[idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        t_offset = t * M/2;
+        out_t_offset = t * 128 * 32 * 2 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            if(i < K * 16){
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    if(x_gate != nullptr){
+                        reinterpret_cast<__half2*>(shared_real)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset + t_offset], x_gate[idx + offset + t_offset]) : __floats2half2_rn(0.0f, 0.0f);
+                    }
+                    else{
+                        reinterpret_cast<__half2*>(shared_real)[shared_offset] = idx < max_idx ? x[idx + offset + t_offset] : __floats2half2_rn(0.0f, 0.0f);
+                    }
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            __half2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+                }
+            }
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(shared_real + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+                wmma::store_matrix_sync(shared_imag + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = threadIdx.y; i < N; i+=blockDim.y)
+            {
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[idx + out_offset + out_t_offset] = reinterpret_cast<__half2*>(shared_real)[shared_offset];
+                    out_imag[idx + out_offset + out_t_offset] = reinterpret_cast<__half2*>(shared_imag)[shared_offset];
+                }
+            }
+            __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_padded_cuda_kernel_32(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 32;
+    __shared__ half x_shared[K * 16 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    __shared__ half out_imag_shared[32 * 64];
+    const int offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int out_offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x;
+    for(int i = threadIdx.y; i<32; i+=blockDim.y){
+        int idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        int shared_offset = i * 32 + threadIdx.x;
+        if(i < K * 16){
+            if(x_gate != nullptr){
+                reinterpret_cast<__half2*>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[offset  + idx], x_gate[offset  + idx]) : __floats2half2_rn(0.0f, 0.0f);
+            }
+            else{
+                reinterpret_cast<__half2*>(x_shared)[shared_offset] = idx < max_idx ? x[offset  + idx] : __floats2half2_rn(0.0f, 0.0f);
+            }
+        }
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[K][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                if(i<K){
+                    wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                }
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], __float2half(0.0f));
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]));
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+    // int idx = offset + threadIdx.y * 32 + blockIdx.x * 32 + threadIdx.x;
+    for(int i = threadIdx.y; i<32; i+=blockDim.y){
+        int idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        out_real[out_offset + idx] = reinterpret_cast<__half2*>(out_real_shared)[i * 32 + threadIdx.x];
+        out_imag[out_offset + idx] = reinterpret_cast<__half2*>(out_imag_shared)[i * 32 + threadIdx.x];
+    }
+}
+__global__ void butterfly_padded_cuda_kernel_16(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 16;
+    const int offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int out_offset = blockIdx.y * H * N * blockDim.x * gridDim.x + blockIdx.z * N * blockDim.x * gridDim.x;
+    __shared__ half x_shared[N * 64];
+    __shared__ half d_f_real[N * N];
+    __shared__ half d_f_imag[N * N];
+    __shared__ half twiddles_real_shared[N * 64];
+    __shared__ half twiddles_imag_shared[N * 64];
+    __shared__ half out_real_shared[N * 64];
+    __shared__ half out_imag_shared[N * 64];
+    // #pragma unroll
+  for(int i = threadIdx.y; i<N; i+=blockDim.y){
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        int shared_offset = i * blockDim.x + threadIdx.x;
+        if(x_gate != NULL){
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset], x_gate[idx + offset]) : __floats2half2_rn(0.0f, 0.0f);
+        }
+        else{
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset] : __floats2half2_rn(0.0f, 0.0f);
+        }
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){
+            shared_offset = i * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);
+        wmma::fill_fragment(acc_frag_imag, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)
+        {
+            tmp_real = reinterpret_cast<__half2 *>(acc_frag_real.x)[k];
+            tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<__half2 *>(acc_frag_real.x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]));
+            reinterpret_cast<__half2 *>(acc_frag_imag.x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]));
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i<N; i+=blockDim.y)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        out_real[out_offset + idx] = reinterpret_cast<__half2 *>(out_real_shared)[i * 32 + threadIdx.x];
+        out_imag[out_offset +  idx] = reinterpret_cast<__half2 *>(out_imag_shared)[i * 32 + threadIdx.x];
+    }
+}
+std::vector<torch::Tensor> butterfly_padded_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    std::optional<at::Tensor> x_gate = std::nullopt
+    )
+{
+    uint B = x.size(0);
+    uint H = x.size(1);
+    uint N = x.size(2);
+    uint d_f_size = d_f.size(1);
+    //need to make sure that N is less that the M to which we are padding
+    assert(N <= d_f_size * M);
+    // printf("B: %d, H: %d, N: %d\n", B, H, N);
+    dim3 gridDim;
+    dim3 blockDim;
+    gridDim.y = B;
+    gridDim.z = H;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    torch::Tensor out_real = torch::empty({B, H, d_f_size * M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, d_f_size * M}, x.options());
+    gridDim.x = 512 / (32 * 1024/ M);
+    const int K = ceil(N / (1.0 * 16 * M));
+    switch(d_f_size){
+        case 16:
+            butterfly_padded_cuda_kernel_16<<<gridDim, blockDim>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+        case 32:
+            switch (K)
+            {
+                case 1:
+                    butterfly_padded_cuda_kernel_32<1><<<gridDim, blockDim>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 2:
+                    butterfly_padded_cuda_kernel_32<2><<<gridDim, blockDim>>>(
+                            static_cast<__half2 *>(x.data_ptr()),
+                            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                            static_cast<complex_half_t *>(d_f.data_ptr()),
+                            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                            static_cast<__half2 *>(out_real.data_ptr()),
+                            static_cast<__half2 *>(out_imag.data_ptr()),
+                            B,
+                            H,
+                            N);
+                    break;
+                default:
+                    printf("Invalid K, df size 32: %d\n", K);
+            }
+            break;
+        case 64:
+            gridDim.z = H / 16;
+            switch (K)
+            {
+                case 1:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_64<1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_64<1><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                    N);
+                    break;
+                case 2:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_64<2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_64<2><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                    N);
+                    break;
+                case 3:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_64<3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_64<3><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                    N);
+                    break;
+                case 4:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_64<4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_64<4><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                    N);
+                    break;
+                default:
+                    printf("Invalid K, df size 64: %d\n", K);
+            }
+            break;
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            gridDim.x = 256 / (32 * 1024/ M);
+            gridDim.z = H / 16;
+            switch(K){
+                case 1:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<1><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 2:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<2><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 3:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<3><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 4:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<4><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 5:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<5>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<5><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 6:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<6>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<6><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 7:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<7>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<7><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 8:
+                    cudaFuncSetAttribute(&butterfly_padded_cuda_kernel_128<8>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_padded_cuda_kernel_128<8><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__half2 *>(x.data_ptr()),
+                        x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<complex_half_t *>(d_f.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__half2 *>(out_real.data_ptr()),
+                        static_cast<__half2 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                default:
+                    printf("Invalid K, df size 128: %d\n", K);
+            }
+            break;
+        default:
+            printf("Invalid d_f size: %d\n", d_f_size);
+    }
+    return {out_real, out_imag};
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_cuda_bf16.cu ADDED Viewed

	@@ -0,0 +1,897 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+template <int K>
+__global__ void butterfly_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset = blockIdx.y * H * M/2 + blockIdx.z * 16 *  M/2;
+    const int out_offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    const int N = 64;
+    extern __shared__ __nv_bfloat16 x_shared[];
+    __nv_bfloat16 *d_f_real_shared = &x_shared[K * 16 * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    float *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
+    }
+    float2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[4][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[4];
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        t_offset = t * M/2;
+        out_t_offset = t * 64 * 32 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            if(i < K * 16){
+                idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+                shared_offset = i * 32 + threadIdx.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset + t_offset], x_gate[idx + offset + t_offset]) : __floats2bfloat162_rn(0.0f, 0.0f);
+                }else{
+                    reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset + t_offset] : __floats2bfloat162_rn(0.0f, 0.0f);
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], 0.0f);
+            for (int k = 0; k < K; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+            for (int k = 0; k < K; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+            }
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            out_real[out_offset + out_t_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[shared_offset]);
+            out_imag[out_offset + out_t_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[shared_offset]);
+        }
+        __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int N  = 32;
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int out_offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x;
+    __shared__ __nv_bfloat16 x_shared[K * 16 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[32 * 32];
+    __shared__ __nv_bfloat16 d_f_imag_shared[32 * 32];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    __shared__ float out_imag_shared[32 * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i<32; i+=blockDim.y)
+    {
+        int idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        int shared_offset = i * 32 + threadIdx.x;
+        if(i < K * 16){
+            if(x_gate != nullptr){
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset], x_gate[idx + offset]) : __floats2bfloat162_rn(0.0f, 0.0f);
+            }else{
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset] : __floats2bfloat162_rn(0.0f, 0.0f);
+            }
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+        d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16)
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[K][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+                if(i < K){
+                    wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                }
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], 0.0f);
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                 for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {
+                    tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]);
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i<32; i+=blockDim.y)
+    {
+        int idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        out_real[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[i * 32 + threadIdx.x]);
+        out_imag[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[i * 32 + threadIdx.x]);
+    }
+}
+template <int K>
+__global__ void butterfly_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int offset = blockIdx.y * H * M/2 + blockIdx.z * 16 *  M/2;
+    const int out_offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x;
+    const int N = 128;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    extern __shared__ __nv_bfloat16 shared_real[];
+    __nv_bfloat16 *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[K][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[8];
+    for (int i = threadIdx.y ; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162 *>(shared_real)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162 *>(shared_imag)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = twiddle_factors_real[idx];
+            reinterpret_cast<__nv_bfloat162*>(shared_imag)[shared_offset] = twiddle_factors_imag[idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        t_offset = t * M/2;
+        out_t_offset = t * 128 * 32 * 2 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            if(i < K * 16){
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    if(x_gate != nullptr){
+                        reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = idx < max_idx ?  __hmul2(x[idx + offset + t_offset], x_gate[idx + offset + t_offset]) : __floats2bfloat162_rn(0.0f, 0.0f);
+                    }else{
+                        reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = idx < max_idx ? x[idx + offset + t_offset] : __floats2bfloat162_rn(0.0f, 0.0f);
+                    }
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], 0.0f);
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+                for (int k = 0; k < K; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            float2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+                }
+            }
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = threadIdx.y; i < N; i+=blockDim.y)
+            {
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[idx + out_offset + out_t_offset] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+                }
+            }
+            __syncthreads();
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = threadIdx.y; i < N; i+=blockDim.y)
+            {
+                for(int j=0; j< 2; j++){
+                    idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                    shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                    out_imag[idx + out_offset + out_t_offset] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+                }
+            }
+    }
+}
+template<int K>
+__global__ void butterfly_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 16;
+    const int offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int out_offset = blockIdx.y * H * N * blockDim.x * gridDim.x + blockIdx.z * N * blockDim.x * gridDim.x;
+    __shared__ __nv_bfloat16 x_shared[N * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[N * N];
+    __shared__ __nv_bfloat16 d_f_imag_shared[N * N];
+    __shared__ __nv_bfloat16 twiddles_real_shared[N * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[N * 64];
+    __shared__ float out_real_shared[N * 64];
+    __shared__ float out_imag_shared[N * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        int shared_offset = i * blockDim.x + threadIdx.x;
+        if(x_gate != nullptr){
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? __hmul2(x[idx + offset], x_gate[idx + offset]) : __floats2bfloat162_rn(0.0f, 0.0f);
+        }else{
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = idx < max_idx ? x[idx + offset] : __floats2bfloat162_rn(0.0f, 0.0f);
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){
+            shared_offset = i * 16 + threadIdx.x;
+            d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+            d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real_shared, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag_shared, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);
+        wmma::fill_fragment(acc_frag_imag, 0.0f);
+         wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);
+#pragma unroll
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)
+        {
+            tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real.x)[k];
+            tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<float2 *>(acc_frag_real.x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]);
+            reinterpret_cast<float2 *>(acc_frag_imag.x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]);
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;;
+        out_real[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[i * 32 + threadIdx.x]);
+        out_imag[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[i * 32 + threadIdx.x]);
+    }
+}
+std::vector<torch::Tensor> butterfly_padded_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    std::optional<at::Tensor> x_gate = std::nullopt
+    )
+{
+    uint B = x.size(0);
+    uint H = x.size(1);
+    uint d_f_size = d_f_real.size(1);
+    uint N = x.size(2);
+    //need to make sure that N is less that the M to which we are padding
+    assert(N <= d_f_size * M);
+    dim3 gridDim;
+    dim3 blockDim;
+    gridDim.y = B;
+    gridDim.z = H;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    torch::Tensor out_real = torch::empty({B, H, d_f_size * M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, d_f_size * M}, x.options());
+    gridDim.x = 512 / (32 * 1024/ M);
+    const int K = ceil(N / (1.0 * 16 * M));
+    switch (d_f_size)
+    {
+        case 16:
+            butterfly_cuda_kernel_16<1><<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+        case 32:
+            switch(K){
+                case 1:
+                    butterfly_cuda_kernel_32<1><<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                case 2:
+                    butterfly_cuda_kernel_32<2><<<gridDim, blockDim>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                default:
+                    printf("Invalid K, df size 32: %d\n", K);
+            }
+            break;
+        case 64:
+            gridDim.z = H / 16;
+            switch(K){
+                case 1:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_64<1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+                    butterfly_cuda_kernel_64<1><<<gridDim, blockDim, 78000>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                case 2:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_64<2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+                    butterfly_cuda_kernel_64<2><<<gridDim, blockDim, 78000>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                case 3:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_64<3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+                    butterfly_cuda_kernel_64<3><<<gridDim, blockDim, 78000>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                case 4:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_64<4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000);
+                    butterfly_cuda_kernel_64<4><<<gridDim, blockDim, 78000>>>(
+                    static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                    x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                    B,
+                    H,
+                    N);
+                    break;
+                default:
+                    printf("Invalid K, df size 64: %d\n", K);
+            }
+            break;
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            gridDim.x = 256 / (32 * 1024/ M);
+            gridDim.z = H / 16;
+            switch(K){
+                case 1:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<1><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                        break;
+                case 2:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<2><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 3:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<3><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 4:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<4><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 5:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<5>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<5><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 6:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<6>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<6><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 7:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<7>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<7><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                case 8:
+                    cudaFuncSetAttribute(&butterfly_cuda_kernel_128<8>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                    butterfly_cuda_kernel_128<8><<<gridDim, blockDim, 65536>>>(
+                        static_cast<__nv_bfloat162 *>(x.data_ptr()),
+                        x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr,
+                        static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                        static_cast<__nv_bfloat162 *>(out_imag.data_ptr()),
+                        B,
+                        H,
+                        N);
+                    break;
+                default:
+                    printf("Invalid K, df size 128: %d\n", K);
+            }
+            break;
+        default:
+        printf("Not yet implemented \n");
+            break;
+    }
+    return {out_real, out_imag};
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_ifft_cuda.cu ADDED Viewed

	@@ -0,0 +1,905 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+template <int TILE_H, int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_64(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int out_offset = blockIdx.y * H * M/2 + blockIdx.z * TILE_H * M/2;
+    const int in_offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * TILE_H * 64 * 32 * gridDim.x;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    const int N = 64;
+    extern __shared__ half x_real_shared[];
+    half *x_imag_shared = &x_real_shared[N * N];
+    half *d_f_real = &x_imag_shared[N * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[K][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[K][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[K];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 64 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        if(i < K){
+#pragma unroll
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+            }
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < TILE_H; t++)
+    {
+        out_t_offset = t * M/2;
+        t_offset = t * 64 * 32 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + in_offset + t_offset];
+            reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + in_offset + t_offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]);
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < K; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            if(idx < max_idx){
+                if(out_gate != nullptr)
+                    out_real[out_offset + out_t_offset + idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[shared_offset], out_gate[out_offset + out_t_offset + idx]);
+                else
+                    out_real[out_offset + out_t_offset + idx] = reinterpret_cast<__half2 *>(out_real_shared)[shared_offset];
+            }
+        }
+        __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_32(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 32;
+    int idx;
+    int shared_offset;
+    const int out_offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int in_offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x;
+    __shared__ half x_real_shared[32 * 64];
+    __shared__ half x_imag_shared[32 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        int shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[in_offset  + idx];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[in_offset  + idx];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 32 + threadIdx.x;
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N/16)
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[K][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[K][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[K][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                if(i < K){
+                    wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                    wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                }
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = __hneg(acc_frag_real[i][j].x[k]);
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        if(idx < max_idx){
+            if(out_gate != nullptr){
+                out_real[idx +  out_offset] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[shared_offset], out_gate[idx +  out_offset]);
+            }else{
+                out_real[idx +  out_offset] = reinterpret_cast<__half2 *>(out_real_shared)[shared_offset];
+            }
+        }
+    }
+}
+template <int TILE_H, int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_128(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int out_offset = blockIdx.y * H * M/2 + blockIdx.z * TILE_H *  M/2;
+    const int in_offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * TILE_H * 128 * 32 *  2 * gridDim.x;
+    const int N = 128;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    extern __shared__ half real_shared[];
+    half *imag_shared = &real_shared[128 * 128];
+    half *real_shared_2 = &imag_shared[128 * 128];
+    half *imag_shared_2 = &real_shared_2[128 * 128];
+    half tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag[K][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[K];
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = i * 128 + threadIdx.x + j * blockDim.x;
+            real_shared_2[shared_offset] = d_f[shared_offset].real();
+            imag_shared_2[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(real_shared)[shared_offset] = twiddle_factors_real[idx];
+            reinterpret_cast<__half2*>(imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int t = 0; t < TILE_H; t++)
+    {
+        out_t_offset = t * M/2;
+        t_offset = t * 128 * 32 * 2  * gridDim.x;
+        for (int i = 0; i < K; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            for(int j=0; j< 2; j++){
+                idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__half2*>(real_shared)[shared_offset] = x_real[idx + in_offset + t_offset];
+                reinterpret_cast<__half2*>(imag_shared)[shared_offset] = x_imag[idx + in_offset + t_offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]);
+            }
+        }
+        for (int i = 0; i < K; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < K; i++)
+        {
+            //wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            for(int j=0; j< 2; j++){
+                idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                if(idx < max_idx){
+                    if(out_gate != nullptr){
+                        out_real[idx + out_offset + out_t_offset] = __hmul2(reinterpret_cast<__half2*>(real_shared)[shared_offset], out_gate[idx + out_offset + out_t_offset]);
+                    }else{
+                        out_real[idx + out_offset + out_t_offset] = reinterpret_cast<__half2*>(real_shared)[shared_offset];
+                    }
+                }
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_padded_cuda_kernel_16(
+    const __half2 *__restrict__ x_real,
+    const __half2 *__restrict__ x_imag,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 16;
+    const int out_offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int offset = blockIdx.y * H * N * blockDim.x * gridDim.x + blockIdx.z * N * blockDim.x * gridDim.x;
+    __shared__ half x_real_shared[N * 64];
+    __shared__ half x_imag_shared[N * 64];
+    __shared__ half d_f_real[N * N];
+    __shared__ half d_f_imag[N * N];
+    __shared__ half twiddles_real_shared[N * 64];
+    __shared__ half twiddles_imag_shared[N * 64];
+    __shared__ half out_real_shared[N * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        int shared_offset = i * blockDim.x + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        if(threadIdx.x  < 16 ){
+            shared_offset = i * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    //check if it is better to have one warp do all the multiplication or split between warps
+    if (threadIdx.y < 4)
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++)
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real);
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = __hneg(acc_frag_real.x[k]);
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        if(idx < max_idx){
+            if(out_gate != nullptr){
+                out_real[out_offset + idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[i * 32 + threadIdx.x], out_gate[out_offset + idx]);
+            }
+            else{
+                out_real[out_offset + idx] = reinterpret_cast<__half2 *>(out_real_shared)[i * 32 + threadIdx.x];
+            }
+        }
+    }
+}
+torch::Tensor butterfly_ifft_padded_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int fft_size,
+    std::optional<at::Tensor> out_gate = std::nullopt
+    )
+{
+    uint B = x_real.size(0);
+    uint H = x_real.size(1);
+    uint N_M = x_real.size(2);
+    const int d_f_size = d_f.size(0);
+    // const int TILE_SIZE = 16;
+    dim3 gridDim;
+    dim3 blockDim;
+    // uint N = x_real.size(2);
+    gridDim.y = B;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    gridDim.x = 512 / (32 * 1024/ (N_M / d_f_size));
+    gridDim.z = H;
+    const int TILE_H = 16;
+    torch::Tensor out_real = torch::empty({B, H, fft_size}, x_real.options());
+    const int K = ceil(fft_size / (1.0 * 16 * (N_M / d_f_size)));
+    switch(d_f_size){
+        case 16:
+            butterfly_ifft_padded_cuda_kernel_16<<<gridDim, blockDim>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+            break;
+        case 32:
+            switch (K)
+            {
+            case 1:
+                butterfly_ifft_padded_cuda_kernel_32<1><<<gridDim, blockDim>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+            case 2:
+                butterfly_ifft_padded_cuda_kernel_32<2><<<gridDim, blockDim>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+            default:
+                printf("Invalid K: %d\n", K);
+                break;
+            }
+            break;
+        case 64:
+            gridDim.z = H / TILE_H;
+            switch (K)
+            {
+            case 1:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 1><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 2:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 2><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 3:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 3><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 4:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 4><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            default:
+                break;
+            }
+            break;
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            gridDim.x = 256 / (32 * 1024/ (N_M / d_f_size));
+            gridDim.z = H / TILE_H;
+            switch (K)
+            {
+            case 1:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 1><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 2:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 2><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 3:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 3><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 4:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 4><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 5:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 5>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 5><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 6:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 6>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 6><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 7:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 7>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 7><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 8:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 8>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 8><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__half2 *>(x_real.data_ptr()),
+                    static_cast<__half2 *>(x_imag.data_ptr()),
+                    static_cast<complex_half_t *>(d_f.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__half2 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            default:
+                printf("Invalid K: %d\n", K);
+                break;
+            }
+            break;
+        default:
+            printf("Invalid d_f_size: %d\n", d_f_size);
+            break;
+    }
+    return out_real;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_padded_ifft_cuda_bf16.cu ADDED Viewed

	@@ -0,0 +1,917 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+template <int TILE_H, int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int out_offset = blockIdx.y * H * M/2 + blockIdx.z * TILE_H * M/2;
+    const int in_offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * TILE_H * 64 * 32 * gridDim.x;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    const int N = 64;
+    extern __shared__ __nv_bfloat16 x_real_shared[];
+    __nv_bfloat16 *x_imag_shared = &x_real_shared[N * N];
+    __nv_bfloat16 *d_f_real_shared = &x_imag_shared[N * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[K][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[K][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[K];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] =  d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] =  d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+        if(i < K){
+#pragma unroll
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+            }
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < TILE_H; t++)
+    {
+        out_t_offset = t * M/2;
+        t_offset = t * 64 * 32 * gridDim.x;
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + in_offset + t_offset];
+            reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + in_offset + t_offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < K; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+            shared_offset = i * 32 + threadIdx.x;
+            if(idx < max_idx){
+                if(out_gate != nullptr)
+                    out_real[out_offset + out_t_offset + idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[shared_offset]), out_gate[out_offset + out_t_offset + idx]);
+                else
+                    out_real[out_offset + out_t_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[shared_offset]);
+            }
+        }
+        __syncthreads();
+    }
+}
+template <int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 32;
+    int idx;
+    int shared_offset;
+    const int out_offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int in_offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x;
+    __shared__ __nv_bfloat16 x_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[32 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[32 * 32];
+    __shared__ __nv_bfloat16 d_f_imag_shared[32 * 32];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        int shared_offset = i * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[in_offset  + idx];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[in_offset  + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        // #pragma unroll
+        shared_offset = i * 32 + threadIdx.x;
+        d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+        d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    if (threadIdx.y < N/16)
+    {
+        __nv_bfloat16 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[K][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[K][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[K][2];
+        int t = threadIdx.y * 32;
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                if(i < K){
+                    wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+                    wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+                }
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = - acc_frag_real[i][j].x[k];
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        idx = i * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+        shared_offset = i * 32 + threadIdx.x;
+        if(idx < max_idx){
+            if(out_gate != nullptr){
+                out_real[idx +  out_offset] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[shared_offset]), out_gate[idx +  out_offset]);
+            }else{
+                out_real[idx +  out_offset] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[shared_offset]);
+            }
+        }
+    }
+}
+template <int TILE_H, int K>
+__global__ void butterfly_ifft_padded_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162  *__restrict__ d_f_real,
+    const __nv_bfloat162  *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int out_offset = blockIdx.y * H * M/2 + blockIdx.z * TILE_H *  M/2;
+    const int in_offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * TILE_H * 128 * 32 *  2 * gridDim.x;
+    const int N = 128;
+    int idx;
+    int t_offset;
+    int out_t_offset;
+    int shared_offset;
+    extern __shared__ __nv_bfloat16 real_shared[];
+    __nv_bfloat16 *imag_shared = &real_shared[128 * 128];
+    __nv_bfloat16 *real_shared_2 = &imag_shared[128 * 128];
+    __nv_bfloat16 *imag_shared_2 = &real_shared_2[128 * 128];
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag[K][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[K];
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared_2)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared_2)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    for (int i = threadIdx.y; i < N; i+=blockDim.y)
+    {
+        for(int j=0; j< 2; j++){
+            idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+            shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = twiddle_factors_real[idx];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    for (int t = 0; t < TILE_H; t++)
+    {
+        out_t_offset = t * M/2;
+        t_offset = t * 128 * 32 * 2  * gridDim.x;
+        for (int i = 0; i < K; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            for(int j=0; j< 2; j++){
+                idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = x_real[idx + in_offset + t_offset];
+                reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = x_imag[idx + in_offset + t_offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        __syncthreads();
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < K; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = -acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < K; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < K; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < K; i++)
+        {
+            //wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(reinterpret_cast<float*>(real_shared) + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = threadIdx.y; i < N; i+=blockDim.y)
+        {
+            for(int j=0; j< 2; j++){
+                idx = i * 32 * 2 * gridDim.x + j * blockDim.x + blockIdx.x * 64 + threadIdx.x;
+                shared_offset = i * 64 + threadIdx.x + j * blockDim.x;
+                if(idx < max_idx){
+                    if(out_gate != nullptr){
+                        out_real[idx + out_offset + out_t_offset] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]), out_gate[idx + out_offset + out_t_offset]);
+                    }else{
+                        out_real[idx + out_offset + out_t_offset] = __float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]);
+                    }
+                }
+            }
+        }
+        __syncthreads();
+    }
+}
+__global__ void butterfly_ifft_padded_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int M)
+{
+    const int max_idx = M / 2; //actually should be -1 since indices are 0-based but we are using < instead of <=
+    const int N  = 16;
+    const int out_offset  =  blockIdx.y * H * M / 2 + blockIdx.z * M / 2;
+    const int offset = blockIdx.y * H * N * blockDim.x * gridDim.x + blockIdx.z * N * blockDim.x * gridDim.x;
+    __shared__ __nv_bfloat16 x_real_shared[N * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[N * 64];
+    __shared__ __nv_bfloat16 twiddles_real_shared[N * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[N * 64];
+    __shared__ float out_real_shared[N * 64];
+    // #pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+        int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        int shared_offset = i * blockDim.x + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[idx];
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        __nv_bfloat16 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++)
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real);
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = - acc_frag_real.x[k];
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = threadIdx.y; i < N; i++)
+    {
+       int idx = i * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+        if(idx < max_idx){
+            if(out_gate != nullptr){
+                out_real[out_offset + idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[i * 32 + threadIdx.x]), out_gate[out_offset + idx]);
+            }else{
+                out_real[out_offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[i * 32 + threadIdx.x]);
+            }
+        }
+    }
+}
+torch::Tensor butterfly_ifft_padded_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int fft_size,
+    std::optional<at::Tensor> out_gate = std::nullopt
+    )
+{
+    uint B = x_real.size(0);
+    uint H = x_real.size(1);
+    uint N_M = x_real.size(2);
+    const int d_f_size = d_f_real.size(0);
+    // const int TILE_SIZE = 16;
+    dim3 gridDim;
+    dim3 blockDim;
+    // uint N = x_real.size(2);
+    gridDim.y = B;
+    blockDim.x = 32;
+    blockDim.y = 4;
+    gridDim.x = 512 / (32 * 1024/ (N_M / d_f_size));
+    gridDim.z = H;
+    const int TILE_H = 16;
+    torch::Tensor out_real = torch::empty({B, H, fft_size}, x_real.options());
+    const int K = ceil(fft_size / (1.0 * 16 * (N_M / d_f_size)));
+    switch(d_f_size){
+        case 16:
+            butterfly_ifft_padded_cuda_kernel_16<<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+        case 32:
+            switch (K)
+            {
+            case 1:
+                butterfly_ifft_padded_cuda_kernel_32<1><<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+            case 2:
+                butterfly_ifft_padded_cuda_kernel_32<2><<<gridDim, blockDim>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size
+                );
+                break;
+            default:
+                printf("Invalid K: %d\n", K);
+                break;
+            }
+            break;
+        case 64:
+            gridDim.z = H / TILE_H;
+            switch (K)
+            {
+            case 1:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 1><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 2:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 2><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 3:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 3><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 4:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_64<TILE_H, 4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+                butterfly_ifft_padded_cuda_kernel_64<TILE_H, 4><<<gridDim, blockDim, 65536>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            default:
+                break;
+            }
+            break;
+        case 128:
+            blockDim.x = 32;
+            blockDim.y = 8;
+            gridDim.x = 256 / (32 * 1024/ (N_M / d_f_size));
+            gridDim.z = H / TILE_H;
+            switch (K)
+            {
+            case 1:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 1>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 1><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 2:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 2>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 2><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 3:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 3>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 3><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 4:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 4>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 4><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 5:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 5>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 5><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 6:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 6>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 6><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 7:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 7>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 7><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            case 8:
+                cudaFuncSetAttribute(&butterfly_ifft_padded_cuda_kernel_128<TILE_H, 8>, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2);
+                butterfly_ifft_padded_cuda_kernel_128<TILE_H, 8><<<gridDim, blockDim, 65536 * 2>>>(
+                    static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+                    static_cast<__nv_bfloat162 *>(out_real.data_ptr()),
+                    out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
+                    B,
+                    H,
+                    fft_size);
+                break;
+            default:
+                printf("Invalid K: %d\n", K);
+                break;
+            }
+            break;
+        default:
+            printf("Invalid d_f_size: %d\n", d_f_size);
+            break;
+    }
+    return out_real;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/shared.h ADDED Viewed

	@@ -0,0 +1,60 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+using namespace nvcuda;
+using complex_half_t = typename c10::complex<at::Half>;
+using complex_bhalf_t = typename c10::complex<at::BFloat16>;
+#define WMMA_M 16
+#define WMMA_N 16
+#define WMMA_K 16
+#define WARP_SIZE 32
+#ifndef MONARCH_CUDA_H_
+#define MONARCH_CUDA_H_
+__device__ __forceinline__ float2
+operator+( float2 lhs, float2 rhs)
+{
+    float2 res = { lhs.x + rhs.x , lhs.y + rhs.y };
+    return res;
+}
+__device__ __forceinline__ float2
+operator-( float2 lhs, float2 rhs)
+{
+    float2 res = { lhs.x - rhs.x , lhs.y - rhs.y };
+    return res;
+}
+__device__ __forceinline__ float2
+operator*( float2 lhs, float2 rhs)
+{
+    float2 res = { lhs.x * rhs.x , lhs.y * rhs.y };
+    return res;
+}
+#endif

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d.h ADDED Viewed

	@@ -0,0 +1,96 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_IS_HALF_OR_BFLOAT_OR_FLOAT(x) TORCH_CHECK(x.dtype() == torch::kFloat16 || x.dtype() == torch::kBFloat16 || x.dtype() == torch::kFloat32, #x " must be float16 or bfloat16 or float32")
+#define CHECK_SAME_TYPE(x, y) TORCH_CHECK(x.dtype() == y.dtype(), #x " and " #y " must have the same dtype")
+#define CHECK_INPUT(x) \
+    CHECK_CUDA(x);     \
+    CHECK_CONTIGUOUS(x); \
+    CHECK_IS_HALF_OR_BFLOAT_OR_FLOAT(x)
+torch::Tensor conv1d_cuda_bhl(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding);
+torch::Tensor conv1d_cuda_blh(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding);
+std::vector<torch::Tensor> conv1d_backward_bhl_cuda(
+    torch::Tensor dout,
+    torch::Tensor input,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding
+);
+std::vector<torch::Tensor> conv1d_backward_blh_cuda(
+    torch::Tensor dout,
+    torch::Tensor input,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding
+);
+torch::Tensor conv1d_fwd(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding,
+    bool is_bhl)
+{
+    CHECK_INPUT(u);
+    CHECK_INPUT(weight);
+    CHECK_INPUT(bias);
+    CHECK_SAME_TYPE(weight, bias);
+    int k;
+    if(is_bhl){
+        k = weight.size(1);
+    }else{
+        k = weight.size(0);
+    }
+    TORCH_CHECK(k % 2 == 1, "Filter size must be odd number");
+    if(is_bhl){
+        return conv1d_cuda_bhl(u, weight, bias, padding);
+    }else{
+        return conv1d_cuda_blh(u, weight, bias, padding);
+    }
+}
+std::vector<torch::Tensor> conv1d_bwd(
+    torch::Tensor dout,
+    torch::Tensor input,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding,
+    bool is_bhl)
+{
+    CHECK_INPUT(dout);
+    CHECK_INPUT(input);
+    CHECK_INPUT(weight);
+    CHECK_INPUT(bias);
+    CHECK_SAME_TYPE(weight, bias);
+    CHECK_SAME_TYPE(dout, input);
+    if(is_bhl){
+        return conv1d_backward_bhl_cuda(dout, input, weight, bias, padding);
+    } else{
+        return conv1d_backward_blh_cuda(dout, input, weight, bias, padding);
+    }
+}

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bhl.cu ADDED Viewed

	@@ -0,0 +1,132 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+// Simple 1D depthwise convolution implementation with dilation and stride = 1
+#include "shared.h"
+const uint BX = 256;
+const uint BY = 1;
+const uint BZ = 1;
+const uint TILE_SIZE_L = 4;
+const uint TILE_SIZE_D = 1;
+template<typename T, typename U>
+__forceinline__ __device__ T _conv1d_k_3(const T* u, const U* weights, const U* bias, uint padding, uint l, uint d, uint L, uint D, uint K)
+{
+    T tmp;
+    T weight;
+    set_value(&tmp, bias[d]);
+    int idx = l - padding;
+    if(idx >= 0 && idx < L){
+        set_value(&weight, weights[0]);
+        tmp = __hfma(u[d * L + idx], weight, tmp);
+    }
+    idx++;
+    if(idx >= 0 && idx < L){
+        set_value(&weight, weights[1]);
+        tmp = __hfma(u[d * L + idx], weight, tmp);
+    }
+    idx++;
+    if(idx >= 0 && idx < L){
+        set_value(&weight, weights[2]);
+        tmp = __hfma(u[d * L + idx], weight, tmp);
+    }
+    return tmp;
+}
+template<typename T, typename U>
+__global__ void conv1d_kernel(
+    const T *__restrict__ u,
+    const U *__restrict__ weights,
+    const U *__restrict__ bias,
+    T *__restrict__ out,
+    uint padding,
+    uint B,
+    uint L,
+    uint D,
+    uint K,
+    uint L_out
+    )
+{
+    const int b = blockIdx.z * blockDim.z + threadIdx.z;
+    const int d = blockIdx.y * blockDim.y * TILE_SIZE_D + threadIdx.y;
+    const int l_offset = blockIdx.x * blockDim.x * TILE_SIZE_L + threadIdx.x;
+    T tmp;
+    T weight;
+    int idx;
+    int l;
+    for(int l_tile = 0; l_tile < TILE_SIZE_L; l_tile++){
+        l = l_offset + l_tile * blockDim.x;
+        set_value(&tmp, bias[d]);
+        if(d < D && l < L_out && b < B){
+            if(K == 3){
+                out[b * L_out * D + d * L_out + l] = _conv1d_k_3(u + b * L * D, weights + d * K, bias, padding, l, d, L, D, K);
+            } else{
+                for(int k = 0; k < K; k++){
+                    idx = l - padding + k;
+                    if(idx >= 0 && idx < L){
+                        set_value(&weight, weights[d * K + k]);
+                        tmp = __hfma(u[b * L_out * D + d * L + idx], weight, tmp);
+                    }
+                }
+                out[b * L_out * D + d * L_out + l] = tmp;
+            }
+        }
+    }
+}
+torch::Tensor conv1d_cuda_bhl(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding)
+{
+    const uint b = u.size(0);
+    const uint d = u.size(1);
+    const uint l = u.size(2);
+    const uint k = weight.size(1);
+    uint l_out = (l + 2 * padding - k + 1);
+    dim3 blockDims(BX, BY, BZ);
+    dim3 gridDims(ceil(l_out * 1.0 / (BX * TILE_SIZE_L) ), ceil((d * 1.0) / (BY * TILE_SIZE_D)), ceil((b * 1.0) / BZ));
+    torch::Tensor out = torch::empty({b, d, l_out}, u.options());
+    DISPATCH_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d fwd bhl",
+        ([&]
+            { conv1d_kernel<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    static_cast<weight_t *>(bias.data_ptr()),
+                    static_cast<input_t *>(out.data_ptr()),
+                    padding,
+                    b,
+                    l,
+                    d,
+                    k,
+                    l_out
+                    );
+            }
+        )
+    );
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_blh.cu ADDED Viewed

	@@ -0,0 +1,202 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+// Simple 1D depthwise convolution implementation with dilation and stride = 1
+#include "shared.h"
+//For max perf, tune for your GPU and batch size, and datatype etc
+const uint BX = 512;
+const uint BY = 1;
+const uint BZ = 1;
+const uint TILE_SIZE_Y = 4;
+const uint TILE_SIZE_X = 2;
+// Trick to do padding in place without actually creating a new tensor
+__forceinline__ __device__ __half2 get_u(const __half2 *__restrict__ u, uint L_eff, uint l, uint p, uint b, uint k, uint d, uint L, uint D, uint K)
+{
+    return l + k < p || l + k > L_eff - (p + 1) ? __float2half2_rn(0.0f) : u[b * L * D + (l + k - p) * D + d];
+}
+__forceinline__ __device__ __nv_bfloat162 get_u(const __nv_bfloat162 *__restrict__ u, uint L_eff, uint l, uint p, uint b, uint k, uint d, uint L, uint D, uint K)
+{
+    return l + k < p || l + k > L_eff - (p + 1) ? __float2bfloat162_rn(0.0f) : u[b * L * D + (l + k - p) * D + d];
+}
+__forceinline__ __device__ float2 get_u(const float2 *__restrict__ u, uint L_eff, uint l, uint p, uint b, uint k, uint d, uint L, uint D, uint K)
+{
+    return l + k < p || l + k > L_eff - (p + 1) ? make_float2(0.0f, 0.0f) : u[b * L * D + (l + k - p) * D + d];
+}
+//manually unrolling loop for k = 3 leads to good perf, can easily extend for other values of k if need be
+template<typename T, typename U>
+__forceinline__ __device__ T _conv1d_k_3(const T* u, const U* weights, const U* bias, T* out, uint padding, uint b, uint l, uint d, uint t, uint L, uint D, uint K, uint L_eff, uint L_out)
+{
+    T tmp;
+    T weight;
+    set_value(&tmp, bias[d]);
+    set_value(&weight, weights[0 * D + d]);
+    tmp = __hfma2(get_u(u, L_eff, l + t, padding, b, 0, d, L, D, K), weight, tmp);
+    set_value(&weight, weights[1 * D + d]);
+    tmp = __hfma2(get_u(u, L_eff, l + t, padding, b, 1, d, L, D, K), weight, tmp);
+    set_value(&weight, weights[2 * D + d]);
+    out[b * D * L_out  + (l + t) * D + d] = __hfma2(get_u(u, L_eff, l + t, padding, b, 2, d, L, D, K), weight, tmp);
+}
+template<typename T, typename U>
+__global__ void conv1d_kernel_k_3(
+    const T *__restrict__ u,
+    const U *__restrict__ weights,
+    const U *__restrict__ bias,
+    T *__restrict__ out,
+    uint padding,
+    uint B,
+    uint L,
+    uint L_out,
+    uint L_eff,
+    uint D,
+    uint K)
+{
+    const int d_block = blockIdx.x * blockDim.x * TILE_SIZE_X;
+    const int l = blockIdx.y * blockDim.y * TILE_SIZE_Y + threadIdx.y * TILE_SIZE_Y;
+    const int b = blockIdx.z * blockDim.z + threadIdx.z;
+    int d;
+    #pragma unroll
+    for (int i = 0; i < TILE_SIZE_X; i++)
+    {
+        d = d_block + threadIdx.x + i * BX;
+        if (d < D && b < B){
+            #pragma unroll
+            for (int t = 0; t < TILE_SIZE_Y; t++){
+                if (l + t < L_eff - K + 1)
+                {
+                    _conv1d_k_3(u, weights, bias, out, padding, b, l, d, t, L, D, K, L_eff, L_out);
+                }
+            }
+        }
+    }
+}
+template<typename T, typename U>
+__global__ void conv1d_kernel(
+    const T *__restrict__ u,
+    const U *__restrict__ weights,
+    const U *__restrict__ bias,
+    T *__restrict__ out,
+    uint padding,
+    uint B,
+    uint L,
+    uint L_out,
+    uint L_eff,
+    uint D,
+    uint K)
+{
+    const int d_block = blockIdx.x * blockDim.x * TILE_SIZE_X;
+    const int l = blockIdx.y * blockDim.y * TILE_SIZE_Y + threadIdx.y * TILE_SIZE_Y;
+    const int b = blockIdx.z * blockDim.z + threadIdx.z;
+    int d;
+    T tmp;
+    T weight;
+    #pragma unroll
+        for (int i = 0; i < TILE_SIZE_X; i++)
+        {
+            d = d_block + threadIdx.x + i * BX;
+            if (d < D && b < B){
+                #pragma unroll
+                for (int t = 0; t < TILE_SIZE_Y; t++){
+                    if (l + t < L_eff - K + 1)
+                    {
+                        set_value(&tmp, bias[d]);
+                        for(int k = 0; k < K; k++){
+                            set_value(&weight, weights[k * D + d]);
+                            tmp = __hfma2(get_u(u, L_eff, l + t, padding, b, k, d, L, D, K), weight, tmp);
+                        }
+                            out[b * D * L_out  + (l + t) * D + d] = tmp;
+                    }
+                }
+            }
+        }
+}
+torch::Tensor conv1d_cuda_blh(
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding)
+{
+    const uint b = u.size(0);
+    const uint l = u.size(1);
+    const uint d = u.size(2);
+    const uint k = weight.size(0);
+    uint l_eff = l + 2 * padding;
+    dim3 blockDims(BX, BY, BZ);
+    dim3 gridDims(ceil(d * 1.0 / (BX * TILE_SIZE_X * 2) ), ceil((l_eff - k + 1) * 1.0 / (BY * TILE_SIZE_Y)), ceil(b * 1.0 / BZ));
+    uint l_out = (l + 2 * padding - k + 1);
+    torch::Tensor out = torch::empty({b, l_out, d}, u.options());
+    //calling seperate kernels for k=3 and k!=3 leads to better perf
+    if(k==3){
+         DISPATCH_FLOAT2_AND_HALF2_AND_BF162(u.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d fwd blh",
+        ([&]
+            { conv1d_kernel_k_3<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    static_cast<weight_t *>(bias.data_ptr()),
+                    static_cast<input_t *>(out.data_ptr()),
+                    padding,
+                    b,
+                    l,
+                    l_out,
+                    l_eff,
+                    ceil(d/2),
+                    k);
+            }
+        )
+    );
+    }else{
+       DISPATCH_FLOAT2_AND_HALF2_AND_BF162(u.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d fwd blh",
+        ([&]
+            { conv1d_kernel<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    static_cast<weight_t *>(bias.data_ptr()),
+                    static_cast<input_t *>(out.data_ptr()),
+                    padding,
+                    b,
+                    l,
+                    l_out,
+                    l_eff,
+                    ceil(d/2),
+                    k);
+            }
+        )
+    );
+    }
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/conv1d/conv1d_bwd_cuda_bhl.cu ADDED Viewed

	@@ -0,0 +1,106 @@

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include "shared.h"
+const uint BX = 128;
+const uint BY = 1;
+const uint BZ = 1;
+const uint TILE_SIZE = 4;
+template <typename input_t, typename weight_t>
+__global__ void conv1d_backward_kernel(
+    const input_t* __restrict__ dout,
+    const input_t* __restrict__ u,
+    const weight_t* __restrict__ weights,
+    input_t* __restrict__ du,
+    input_t* __restrict__ dk,
+    uint B,
+    uint L,
+    uint D,
+    uint K,
+    uint P
+    )
+{
+    const int b = blockIdx.z;
+    const int d = blockIdx.y;
+    const int l = blockIdx.x;
+    //construct the du matrix
+    if(b < B && d < D && l == 0){
+        for(int j = threadIdx.x; j < L; j += blockDim.x)
+        {
+            input_t sum;
+            set_value(&sum, 0.0f);
+            input_t weight;
+            for(int k = 0; k < K ; k++)
+            {
+                int idx = - P + k + j;
+                if(idx >= 0 && idx < L){
+                    set_value(&weight, weights[d * K + K - (k +1)]);
+                    sum = __hfma(dout[b * D * L + d * L + idx], weight, sum);
+                }
+            }
+            du[b * D * L + d * L + j] = sum;
+        }
+    }
+    const int k = blockIdx.x;
+    input_t tmp;
+    //construct the dk matrix
+    if(b < B && d < D && k < K)
+    {
+        for(int j = threadIdx.x; j < L; j += blockDim.x)
+        {
+            if(k - P + j < 0 || k - P + j >= L){
+                set_value(&dk[b * D * K * L + d * K * L + k * L + j], 0.0f);
+            }else{
+                set_value(&dk[b * D * K * L + d * K * L + k * L + j], u[b * D * L + d * L + k - P + j]);
+            }
+        }
+    }
+}
+std::vector<torch::Tensor> conv1d_backward_bhl_cuda(
+    torch::Tensor dout,
+    torch::Tensor u,
+    torch::Tensor weight,
+    torch::Tensor bias,
+    uint padding)
+{
+    const uint b = u.size(0);
+    const uint d = u.size(1);
+    const uint l = u.size(2);
+    const uint k = weight.squeeze().size(1);
+    dim3 blockDims(BX, 1, 1);
+    dim3 gridDims(l, d, b);
+    torch::Tensor du = torch::empty({b, d, l}, u.options());
+    torch::Tensor dk = torch::empty({b, d, k, l}, dout.options());
+    torch::Tensor dbias = dout.sum(-1).sum(0);
+    DISPATCH_FLOAT_AND_HALF_AND_BF16(dout.scalar_type(), weight.scalar_type(),
+        "depthwise conv 1d backward bhl",
+        ([&]
+            { conv1d_backward_kernel<input_t, weight_t><<<gridDims, blockDims>>>(
+                    static_cast<input_t *>(dout.data_ptr()),
+                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<weight_t *>(weight.data_ptr()),
+                    static_cast<input_t *>(du.data_ptr()),
+                    static_cast<input_t *>(dk.data_ptr()),
+                    b,
+                    l,
+                    d,
+                    k,
+                    padding);
+            }
+        )
+    );
+    return {du, torch::matmul(dk, dout.unsqueeze(-1)).squeeze(-1).sum(0).to(weight.type()), dbias};
+}