@@ -3844,13 +5427,173 @@ Cell: combine | 36.17s
# /// script
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///
-import torch
-import sys
import os
-import kernels_benchmark_tools as kbt
+import sys
from pathlib import Path
+import json
+import torch # noqa: F401 # imported because upstream may expect torch to be importable
+import kernels_benchmark_tools as kbt
+
+# --- Matplotlib setup and helpers ------------------------------------------------
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import csv
+
+
+# Keep text as text (not paths) so CSS can style fonts, size, etc.
+mpl.rcParams["svg.fonttype"] = "none"
+# Make ids deterministic across builds
+mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
+# Use tight layout so labels and titles are not clipped in the exported SVG
+mpl.rcParams["figure.autolayout"] = True
+# Make background transparent
+mpl.rcParams["figure.facecolor"] = "none"
+mpl.rcParams["axes.facecolor"] = "none"
+mpl.rcParams["savefig.facecolor"] = "none"
+mpl.rcParams["savefig.edgecolor"] = "none"
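+# Together these rcParams make the exported SVG deterministic, transparent,
+# and CSS-stylable, which the id/class tagging below builds on.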
+
+def _slugify(s: str) -> str:
+    s = (s or "").strip().lower()
+    keep = []
+    for ch in s:
+        if ch.isalnum():
+            keep.append(ch)
+        elif ch in (" ", "-", "_", "/", ".", ":"):
+            keep.append("-")
+        # any other character is dropped
+    out = "".join(keep)
+    while "--" in out:
+        out = out.replace("--", "-")
+    return out.strip("-") or "unnamed"
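+# Example: _slugify("HF Kernels Flash Attn3") -> "hf-kernels-flash-attn3"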
+
+def _tag_current_figure(default_series_prefix="series"):
+    """Attach SVG ids (gid) to key artists so they can be targeted from CSS."""
+    fig = plt.gcf()  # gcf() creates a figure if none exists, so this is never None
+
+    # Tag the figure itself
+    fig.set_gid("figure--latency")
+
+    for ax_idx, ax in enumerate(fig.get_axes(), start=1):
+        ax.set_gid(f"axes--{ax_idx}")
+
+        # Axis labels & title (ax.title is the Text artist holding the title)
+        if ax.get_title():
+            ax.title.set_gid("title--main")
+        if ax.xaxis and ax.xaxis.get_label():
+            ax.xaxis.label.set_gid("label--x")
+        if ax.yaxis and ax.yaxis.get_label():
+            ax.yaxis.label.set_gid("label--y")
+
+        # Gridlines
+        for i, gl in enumerate(ax.get_xgridlines(), start=1):
+            gl.set_gid(f"grid-x--{i}")
+        for i, gl in enumerate(ax.get_ygridlines(), start=1):
+            gl.set_gid(f"grid-y--{i}")
+
+        # Legend block & entries
+        leg = ax.get_legend()
+        if leg is not None:
+            leg.set_gid("legend")
+            for i, txt in enumerate(leg.get_texts(), start=1):
+                label_slug = _slugify(txt.get_text())
+                txt.set_gid(f"legend-label--{label_slug or i}")
+
+        # Series: lines
+        line_seen = {}
+        for ln in getattr(ax, "lines", []):
+            raw_label = ln.get_label() or ""
+            # Matplotlib uses labels beginning with "_" for non-legendable items
+            label = raw_label if not raw_label.startswith("_") else default_series_prefix
+            slug = _slugify(label)
+            line_seen[slug] = line_seen.get(slug, 0) + 1
+            suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
+            ln.set_gid(f"series--{slug}{suffix}")
+
+        # Series: patches (bars, areas)
+        patch_seen = {}
+        for pt in getattr(ax, "patches", []):
+            label = getattr(pt, "get_label", lambda: "")() or default_series_prefix
+            if isinstance(label, str) and label.startswith("_"):
+                label = default_series_prefix
+            slug = _slugify(label)
+            patch_seen[slug] = patch_seen.get(slug, 0) + 1
+            suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
+            pt.set_gid(f"series--{slug}{suffix}")
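+# After tagging, artists carry ids such as "series--hf-kernels-flash-attn3",
+# "grid-x--3", or "legend-label--xformers", so individual series can be
+# themed or highlighted from the embedding page's CSS.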
+
+def _postprocess_svg_add_classes(svg_path: Path):
+    """Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
+    try:
+        import xml.etree.ElementTree as ET
+        ET.register_namespace("", "http://www.w3.org/2000/svg")
+        tree = ET.parse(svg_path)
+        root = tree.getroot()
+        for el in root.iter():
+            el_id = el.attrib.get("id", "")
+            if not el_id:
+                continue
+            cls = []
+            if el_id.startswith("figure--"):
+                cls.append("figure")
+            elif el_id.startswith("axes--"):
+                cls.append("axes")
+            elif el_id.startswith("grid-x--"):
+                cls += ["grid", "grid-x"]
+            elif el_id.startswith("grid-y--"):
+                cls += ["grid", "grid-y"]
+            elif el_id.startswith("legend"):
+                cls.append("legend")
+            elif el_id.startswith("label--x"):
+                cls.append("xlabel")
+            elif el_id.startswith("label--y"):
+                cls.append("ylabel")
+            elif el_id.startswith("title--"):
+                cls.append("title")
+            elif el_id.startswith("series--"):
+                cls.append("series")
+            if cls:
+                # Preserve any existing class (unlikely from Matplotlib)
+                existing = el.attrib.get("class", "")
+                el.set("class", (existing + " " + " ".join(cls)).strip())
+        tree.write(svg_path, encoding="utf-8", xml_declaration=True)
+    except Exception as e:
+        print(f"✗ SVG postprocess (classes) skipped: {e}")
+
+# Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
+_orig_savefig = plt.savefig
+def _savefig_svg(fname, *args, **kwargs):
+    # The caller's fname is deliberately ignored: always save as SVG at a
+    # stable path for the artifact system.
+    out = Path("latency.svg")
+    kwargs["format"] = "svg"
+    # Ensure everything we care about has ids before export
+    _tag_current_figure()
+    res = _orig_savefig(out, *args, **kwargs)
+    # Add helpful CSS classes on top of ids
+    _postprocess_svg_add_classes(out)
+    print(f"✓ Combined visualization saved as {out}")
+    return res
+
+plt.savefig = _savefig_svg  # apply patch
+
+# Capture close calls in case kbt.viz() closes figures before we re-save
+_orig_close = plt.close
+_last_closed = {"fig": None}
+def _capture_close(arg=None):
+    # Remember the figure being closed so it can be recovered if needed
+    if hasattr(arg, "savefig"):  # duck-typed check: looks like a Figure
+        _last_closed["fig"] = arg
+    else:
+        _last_closed["fig"] = plt.gcf()
+    return _orig_close(arg)
+plt.close = _capture_close
-# Discover the upstream artifact directories from environment variables
+# --- Locate benchmark artifacts --------------------------------------------------
cache_dirs = {
    "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
    "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
@@ -3868,8 +5611,6 @@ Cell: combine | 36.17s
print(f"{name:30s}: {cache_dir}")
print()
-# Collect all JSONL paths
-all_paths = []
file_mapping = {
    "Flash (PyTorch SDPA)": "attn.jsonl",
    "MemEff (PyTorch SDPA)": "attn.jsonl",
@@ -3882,10 +5623,10 @@ Cell: combine | 36.17s
"HF Kernels Flash Attn3": "attn.jsonl",
}
+all_paths = []
for name, cache_dir in cache_dirs.items():
    if cache_dir:
-        jsonl_file = file_mapping[name]
-        path = Path(cache_dir) / jsonl_file
+        path = Path(cache_dir) / file_mapping[name]
        if path.exists() and path.stat().st_size > 0:
            all_paths.append(str(path))
            print(f"✓ Found {name}: {path}")
@@ -3893,30 +5634,40 @@ Cell: combine | 36.17s
print(f"⊘ Empty/Missing {name}: {path}")
else:
print(f"✗ No cache dir for {name}")
-
print()
if not all_paths:
    print("ERROR: No benchmark data files found!")
+    # restore patched functions before exiting
+    plt.savefig = _orig_savefig
+    plt.close = _orig_close
    sys.exit(1)
-# Generate combined summary
-print("COMBINED BENCHMARK SUMMARY")
-print()
-
+# --- Summary + Visualization -----------------------------------------------------
+print("COMBINED BENCHMARK SUMMARY\n")
kbt.summarize(all_paths)
-
-print()
-print("GENERATING COMBINED VISUALIZATION")
-print()
+print("\nGENERATING COMBINED VISUALIZATION\n")
try:
+    # If kbt.viz saves internally, our patched savefig ensures SVG gets written,
+    # and it will carry ids/classes for CSS styling.
    kbt.viz(all_paths)
-    print("✓ Combined visualization saved as latency.png")
+    # Safety net: save again via the patched savefig, which tags the figure
+    # and always writes latency.svg, even if kbt.viz saved already.
+    plt.savefig("latency.svg")
+    print("✓ SVG visualization ready: latency.svg")
except ImportError as e:
    print(f"✗ Visualization requires matplotlib: {e}")
except Exception as e:
    print(f"✗ Visualization failed: {e}")
+finally:
+    # Clean up patches to avoid side effects in later cells
+    plt.savefig = _orig_savefig
+    plt.close = _orig_close
print()
print("ANALYSIS COMPLETE")
@@ -3924,17 +5675,72 @@ Cell: combine | 36.17s
print(f"\nImplementations included:")
for name, cache_dir in cache_dirs.items():
    if cache_dir:
-        jsonl_file = file_mapping[name]
-        path = Path(cache_dir) / jsonl_file
+        path = Path(cache_dir) / file_mapping[name]
        if path.exists() and path.stat().st_size > 0:
            print(f" ✓ {name}")
+
+# Collect all benchmark data and export to CSV
+all_data = {}
+for name, cache_dir in cache_dirs.items():
+    if cache_dir:
+        path = Path(cache_dir) / file_mapping[name]
+        if path.exists() and path.stat().st_size > 0:
+            with open(path, 'r') as f:
+                # skip blank lines so a trailing newline can't break json.loads
+                records = [json.loads(line) for line in f if line.strip()]
+            all_data[name] = records
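+# Each JSONL record is expected to carry "impl", "peak_bytes", and the nested
+# "wl" (workload), "lat_ms" (latency stats), and "tags" dicts read below.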
+
+# Export to CSV
+csv_path = Path("latency.csv")
+with open(csv_path, 'w', newline='') as csvfile:
+    writer = csv.writer(csvfile)
+
+    # Write header
+    header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
+              "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
+              # "Compile (ms)",
+              "Peak Mem (MB)", "Backend", "Family"]
+    writer.writerow(header)
+
+    # Write data rows
+    for impl_name, records in all_data.items():
+        for record in records:
+            wl = record.get('wl', {})
+            lat = record.get('lat_ms', {})
+            tags = record.get('tags', {})
+
+            row = [
+                impl_name,
+                record.get('impl', ''),
+                wl.get('name', ''),
+                wl.get('batch', ''),
+                wl.get('seq_len', ''),
+                wl.get('heads', ''),
+                wl.get('head_dim', ''),
+                wl.get('dtype', ''),
+                lat.get('mean', ''),
+                lat.get('p10', ''),
+                lat.get('p50', ''),
+                lat.get('p90', ''),
+                lat.get('reps', ''),
+                # record.get('compile_ms', ''),
+                round(record.get('peak_bytes', 0) / 1024 / 1024, 2) if record.get('peak_bytes') else '',
+                tags.get('backend', ''),
+                tags.get('family', ''),
+            ]
+            writer.writerow(row)
+
+print(f"✓ CSV export complete: {csv_path}")
+print(f"Total implementations: {len(all_data)}")
+print(f"Total records: {sum(len(records) for records in all_data.values())}")
Artifacts:
-latency.png
+latency.svg
+latency.csv
+
+
| Implementation | Impl ID | Workload | Batch | Seq Length | Heads | Head Dim | Dtype | Mean (ms) | P10 (ms) | P50 (ms) | P90 (ms) | Reps | Peak Mem (MB) | Backend | Family |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Flash (PyTorch SDPA) | torch_flash_ma | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.48577280044555665 | 0.47836801409721375 | 0.4803520143032074 | 0.4827199876308441 | 5 | 83.38 | FLASH | torch-sdpa |
| Flash (PyTorch SDPA) | torch_flash_ma | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.5229184031486511 | 0.521727979183197 | 0.5228800177574158 | 0.5234559774398804 | 5 | 90.62 | FLASH | torch-sdpa |
| Flash (PyTorch SDPA) | torch_flash_ma | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.6515967845916748 | 0.6503999829292297 | 0.650879979133606 | 0.6513599753379822 | 5 | 95.06 | FLASH | torch-sdpa |
| Flash (PyTorch SDPA) | torch_flash_ma | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.6807615995407105 | 0.6797440052032471 | 0.6808639764785767 | 0.6815680265426636 | 5 | 99.88 | FLASH | torch-sdpa |
| Flash (PyTorch SDPA) | torch_flash_ma | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.711027193069458 | 0.7058879733085632 | 0.7121919989585876 | 0.7131519913673401 | 5 | 103.81 | FLASH | torch-sdpa |
| Flash (PyTorch SDPA) | torch_flash_ma | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.7391423940658569 | 0.7369279861450195 | 0.7383999824523926 | 0.7408959865570068 | 5 | 109.12 | FLASH | torch-sdpa |
| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.5875647902488709 | 0.5863680243492126 | 0.5874559879302979 | 0.5876479744911194 | 5 | 83.38 | EFFICIENT | torch-sdpa |
| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.653657603263855 | 0.6485440135002136 | 0.6537600159645081 | 0.656544029712677 | 5 | 90.62 | EFFICIENT | torch-sdpa |
| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.7784512042999268 | 0.774944007396698 | 0.778656005859375 | 0.7801600098609924 | 5 | 95.94 | EFFICIENT | torch-sdpa |
| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.7922943949699401 | 0.791263997554779 | 0.7924799919128418 | 0.7927039861679077 | 5 | 100.0 | EFFICIENT | torch-sdpa |
| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.848089587688446 | 0.8444799780845642 | 0.8470079898834229 | 0.8499199748039246 | 5 | 103.81 | EFFICIENT | torch-sdpa |
| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.9523776054382325 | 0.95004802942276 | 0.9519039988517761 | 0.9541119933128357 | 5 | 109.12 | EFFICIENT | torch-sdpa |
| xFormers | xformers_meff | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.45066879987716674 | 0.4474239945411682 | 0.44921600818634033 | 0.45241600275039673 | 5 | 83.38 | memory_efficient | xformers |
| xFormers | xformers_meff | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.47004159688949587 | 0.4652479887008667 | 0.4705919921398163 | 0.4716799855232239 | 5 | 90.62 | memory_efficient | xformers |
| xFormers | xformers_meff | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.6022783994674683 | 0.5987840294837952 | 0.6021760106086731 | 0.6045759916305542 | 5 | 95.06 | memory_efficient | xformers |
| xFormers | xformers_meff | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.6013055920600892 | 0.6000319719314575 | 0.600383996963501 | 0.6016640067100525 | 5 | 99.88 | memory_efficient | xformers |
| xFormers | xformers_meff | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.6408192038536071 | 0.639136016368866 | 0.6404479742050171 | 0.6416320204734802 | 5 | 103.81 | memory_efficient | xformers |
| xFormers | xformers_meff | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.6466111898422241 | 0.6447359919548035 | 0.6462399959564209 | 0.6483839750289917 | 5 | 109.12 | memory_efficient | xformers |
| Compiled (default) | torch_flash_compiled_default | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.527347207069397 | 0.5194560289382935 | 0.5272960066795349 | 0.5312960147857666 | 5 | 83.38 | FLASH | torch-sdpa |
| Compiled (default) | torch_flash_compiled_default | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.5586367964744567 | 0.5560640096664429 | 0.5571519732475281 | 0.5611839890480042 | 5 | 90.62 | FLASH | torch-sdpa |
| Compiled (default) | torch_flash_compiled_default | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.6860736012458801 | 0.6841920018196106 | 0.6860160231590271 | 0.6869760155677795 | 5 | 95.25 | FLASH | torch-sdpa |
| Compiled (default) | torch_flash_compiled_default | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.7167360067367554 | 0.7152000069618225 | 0.7161920070648193 | 0.7164160013198853 | 5 | 99.88 | FLASH | torch-sdpa |
| Compiled (default) | torch_flash_compiled_default | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.7423295855522156 | 0.7400959730148315 | 0.742143988609314 | 0.7431039810180664 | 5 | 103.81 | FLASH | torch-sdpa |
| Compiled (default) | torch_flash_compiled_default | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.7743871927261352 | 0.7718080282211304 | 0.7745919823646545 | 0.7748159766197205 | 5 | 109.12 | FLASH | torch-sdpa |
| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.6489088058471679 | 0.6148160099983215 | 0.6296960115432739 | 0.6522240042686462 | 5 | 67.5 | FLASH | torch-sdpa |
| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.700761592388153 | 0.6615359783172607 | 0.6821119785308838 | 0.7128959894180298 | 5 | 75.0 | FLASH | torch-sdpa |
| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.834444797039032 | 0.7967039942741394 | 0.8164799809455872 | 0.8463680148124695 | 5 | 80.38 | FLASH | torch-sdpa |
| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.8709375977516174 | 0.8432319760322571 | 0.8498560190200806 | 0.8750079870223999 | 5 | 82.5 | FLASH | torch-sdpa |
| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.9069631934165955 | 0.8775359988212585 | 0.9030719995498657 | 0.903872013092041 | 5 | 86.25 | FLASH | torch-sdpa |
| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.9371584057807922 | 0.9145920276641846 | 0.9164159893989563 | 0.9357439875602722 | 5 | 90.0 | FLASH | torch-sdpa |
| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.34446719884872434 | 0.3438720107078552 | 0.3445119857788086 | 0.34457600116729736 | 5 | 83.38 | flash-attn | hf-kernels |
| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.37571839094161985 | 0.37404799461364746 | 0.3763839900493622 | 0.3766399919986725 | 5 | 90.62 | flash-attn | hf-kernels |
| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.4945920050144196 | 0.4925439953804016 | 0.493120014667511 | 0.4938240051269531 | 5 | 95.06 | flash-attn | hf-kernels |
| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.5139647841453552 | 0.5123199820518494 | 0.5142719745635986 | 0.5147839784622192 | 5 | 99.88 | flash-attn | hf-kernels |
| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.5353855967521668 | 0.5339199900627136 | 0.5350080132484436 | 0.5352320075035095 | 5 | 103.81 | flash-attn | hf-kernels |
| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.5548800110816956 | 0.5538560152053833 | 0.5548800230026245 | 0.5553280115127563 | 5 | 109.12 | flash-attn | hf-kernels |
| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.3617343962192535 | 0.36102399230003357 | 0.3616960048675537 | 0.36211198568344116 | 5 | 83.38 | flash-attn3 | hf-kernels |
| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.3907967984676361 | 0.3885439932346344 | 0.39056000113487244 | 0.3906239867210388 | 5 | 90.62 | flash-attn3 | hf-kernels |
| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.5228991985321045 | 0.521344006061554 | 0.5230720043182373 | 0.5232319831848145 | 5 | 95.06 | flash-attn3 | hf-kernels |
| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.5254656076431274 | 0.523904025554657 | 0.5249919891357422 | 0.526528000831604 | 5 | 99.88 | flash-attn3 | hf-kernels |
| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.5646592020988465 | 0.5627840161323547 | 0.565343976020813 | 0.565343976020813 | 5 | 103.81 | flash-attn3 | hf-kernels |
| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.5698879957199097 | 0.567359983921051 | 0.5696640014648438 | 0.5698559880256653 | 5 | 109.12 | flash-attn3 | hf-kernels |