# /// script
# requires-python = ">=3.10"
-# dependencies = [
-# "numpy",
-# "torch",
-# "kernels-benchmark-tools",
-# "matplotlib",
-# ]
-#
+# dependencies = ["torch", "kernels-benchmark-tools", "matplotlib"]
# [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
-import os
-import sys
-from pathlib import Path
-import json
-import torch # noqa: F401 # imported because upstream may expect torch to be importable
-import kernels_benchmark_tools as kbt
-
-# --- Matplotlib setup and helpers ------------------------------------------------
-import matplotlib as mpl
-import matplotlib.pyplot as plt
-import csv
-
-
-# Keep text as text (not paths) so CSS can style fonts, size, etc.
-mpl.rcParams["svg.fonttype"] = "none"
-# Make ids deterministic across builds
-mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
-# Avoid auto-closed figures interfering with our tagging
-mpl.rcParams["figure.autolayout"] = True
-# Make background transparent
-mpl.rcParams["figure.facecolor"] = "none"
-mpl.rcParams["axes.facecolor"] = "none"
-mpl.rcParams["savefig.facecolor"] = "none"
-mpl.rcParams["savefig.edgecolor"] = "none"
-
-def _slugify(s: str) -> str:
- s = (s or "").strip().lower()
- keep = []
- for ch in s:
- if ch.isalnum():
- keep.append(ch)
- elif ch in (" ", "-", "_", "/", ".", ":"):
- keep.append("-")
- else:
- keep.append("")
- out = "".join(keep)
- while "--" in out:
- out = out.replace("--", "-")
- return out.strip("-") or "unnamed"
-
-def _tag_current_figure(default_series_prefix="series"):
- """Attach SVG ids (gid) to key artists so they can be targeted from CSS."""
- fig = plt.gcf()
- if fig is None:
- return
-
- # Tag the figure itself
- fig.set_gid("figure--latency")
-
- for ax_idx, ax in enumerate(fig.get_axes(), start=1):
- ax.set_gid(f"axes--{ax_idx}")
-
- # Axis labels & title
- if ax.get_title():
- for t in ax.texts:
- if t.get_text() == ax.get_title():
- t.set_gid("title--main")
- if ax.xaxis and ax.xaxis.get_label():
- ax.xaxis.label.set_gid("label--x")
- if ax.yaxis and ax.yaxis.get_label():
- ax.yaxis.label.set_gid("label--y")
-
- # Gridlines
- for i, gl in enumerate(ax.get_xgridlines(), start=1):
- gl.set_gid(f"grid-x--{i}")
- for i, gl in enumerate(ax.get_ygridlines(), start=1):
- gl.set_gid(f"grid-y--{i}")
-
- # Legend block & entries
- leg = ax.get_legend()
- if leg is not None:
- leg.set_gid("legend")
- for i, txt in enumerate(leg.get_texts(), start=1):
- label_slug = _slugify(txt.get_text())
- txt.set_gid(f"legend-label--{label_slug or i}")
-
- # Series (lines, patches)
- # Lines
- line_seen = {}
- for ln in getattr(ax, "lines", []):
- raw_label = ln.get_label() or ""
- # Matplotlib uses labels beginning with "_" for non-legendable items
- label = raw_label if not raw_label.startswith("_") else f"{default_series_prefix}"
- slug = _slugify(label)
- line_seen[slug] = line_seen.get(slug, 0) + 1
- suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
- ln.set_gid(f"series--{slug}{suffix}")
-
- # Patches (bars, areas)
- patch_seen = {}
- for pt in getattr(ax, "patches", []):
- label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
- if isinstance(label, str) and label.startswith("_"):
- label = default_series_prefix
- slug = _slugify(label)
- patch_seen[slug] = patch_seen.get(slug, 0) + 1
- suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
- pt.set_gid(f"series--{slug}{suffix}")
-
-def _postprocess_svg_add_classes(svg_path: Path):
- """Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
- try:
- import xml.etree.ElementTree as ET
- ET.register_namespace("", "http://www.w3.org/2000/svg")
- tree = ET.parse(svg_path)
- root = tree.getroot()
- for el in root.iter():
- el_id = el.attrib.get("id", "")
- if not el_id:
- continue
- cls = []
- if el_id.startswith("figure--"):
- cls.append("figure")
- elif el_id.startswith("axes--"):
- cls.append("axes")
- elif el_id.startswith("grid-x--"):
- cls += ["grid", "grid-x"]
- elif el_id.startswith("grid-y--"):
- cls += ["grid", "grid-y"]
- elif el_id.startswith("legend"):
- cls.append("legend")
- elif el_id.startswith("label--x"):
- cls.append("xlabel")
- elif el_id.startswith("label--y"):
- cls.append("ylabel")
- elif el_id.startswith("title--"):
- cls.append("title")
- elif el_id.startswith("series--"):
- cls.append("series")
- if cls:
- # Preserve any existing class (unlikely from Matplotlib)
- existing = el.attrib.get("class", "")
- el.set("class", (existing + " " + " ".join(cls)).strip())
- tree.write(svg_path, encoding="utf-8", xml_declaration=True)
- except Exception as e:
- print(f"✗ SVG postprocess (classes) skipped: {e}")
-
-# Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
-_orig_savefig = plt.savefig
-def _savefig_svg(fname, *args, **kwargs):
- # Always save as SVG at a stable path for the artifact system
- out = Path("latency.svg")
- kwargs["format"] = "svg"
- # Ensure everything we care about has ids before export
- _tag_current_figure()
- res = _orig_savefig(out, *args, **kwargs)
- # Add helpful CSS classes on top of ids
- _postprocess_svg_add_classes(out)
- print(f"✓ Combined visualization saved as {out}")
- return res
-
-plt.savefig = _savefig_svg # apply patch
-
-# Capture close calls in case kbt.viz() closes figures before we re-save
-_orig_close = plt.close
-_last_closed = {"fig": None}
-def _capture_close(arg=None):
- try:
- if hasattr(arg, "savefig"): # looks like a Figure
- _last_closed["fig"] = arg
- else:
- _last_closed["fig"] = plt.gcf()
- finally:
- return _orig_close(arg)
-plt.close = _capture_close
-
-# --- Locate benchmark artifacts --------------------------------------------------
-cache_dirs = {
- "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
- "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
- "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
- "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
- "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
- "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
- "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
- "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
- "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
+# Note: the flash-attention implementations write different output files. Most
+# write attn.jsonl; the compiled variants write attn_default.jsonl and attn_max_autotune.jsonl.
+cache_env_map = {
+ "Flash (PyTorch SDPA)": "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK",
+ "MemEff (PyTorch SDPA)": "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK",
+ "xFormers": "UVNOTE_FILE_XFORMERS_BENCHMARK",
+ "Compiled (default)": "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT",
+ "Compiled (max-autotune)": "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE",
+ "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
+ "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
}
-print("LOADING BENCHMARK DATA")
-for name, cache_dir in cache_dirs.items():
- print(f"{name:30s}: {cache_dir}")
-print()
+# Only the compiled variants need a custom file name; everything else defaults to attn.jsonl.
+import os
+from pathlib import Path
file_mapping = {
- "Flash (PyTorch SDPA)": "attn.jsonl",
- "MemEff (PyTorch SDPA)": "attn.jsonl",
- "Flash Attn 2": "attn.jsonl",
- "xFormers": "attn.jsonl",
- "SageAttention": "attn.jsonl",
"Compiled (default)": "attn_default.jsonl",
"Compiled (max-autotune)": "attn_max_autotune.jsonl",
- "HF Kernels Flash Attn": "attn.jsonl",
- "HF Kernels Flash Attn3": "attn.jsonl",
}
+# Collect paths with custom file names for compiled variants
all_paths = []
-for name, cache_dir in cache_dirs.items():
+for name, env_var in cache_env_map.items():
+ cache_dir = os.environ.get(env_var)
if cache_dir:
- path = Path(cache_dir) / file_mapping[name]
+ filename = file_mapping.get(name, "attn.jsonl")
+ path = Path(cache_dir) / filename
if path.exists() and path.stat().st_size > 0:
all_paths.append(str(path))
print(f"✓ Found {name}: {path}")
else:
- print(f"⊘ Empty/Missing {name}: {path}")
+ print(f"⊘ Skipped {name}: {path}")
else:
- print(f"✗ No cache dir for {name}")
-print()
+ print(f"✗ Missing {name}")
if not all_paths:
print("ERROR: No benchmark data files found!")
- # restore patched functions before exiting
- plt.savefig = _orig_savefig
- plt.close = _orig_close
+ import sys
sys.exit(1)
-# --- Summary + Visualization -----------------------------------------------------
-print("COMBINED BENCHMARK SUMMARY\n")
-kbt.summarize(all_paths)
-print("\nGENERATING COMBINED VISUALIZATION\n")
+# Use the shared visualization helpers from kernels-benchmark-tools
+from kernels_benchmark_tools.core import tools
+from kernels_benchmark_tools.core.visuals import setup_svg_matplotlib, create_svg_with_tagging
+import matplotlib.pyplot as plt  # imported before the try so the finally block can always restore the patches
+
+setup_svg_matplotlib()
+_orig_savefig, _orig_close = create_svg_with_tagging("latency.svg", "flash-attention")
try:
- # If kbt.viz saves internally, our patched savefig ensures SVG gets written,
- # and it will carry ids/classes for CSS styling.
- kbt.viz(all_paths)
- # Safety net: if kbt.viz didn't save, save now.
- # if not Path("latency.svg").exists():
- # _tag_current_figure()
- # plt.savefig("latency.svg")
-
- plt.savefig("latency.svg") # ensure saved with tagging
-
- print("✓ SVG visualization ready: latency.svg!")
-except ImportError as e:
- print(f"✗ Visualization requires matplotlib: {e}")
-except Exception as e:
- print(f"✗ Visualization failed: {e}")
+ print("\nCOMBINED BENCHMARK SUMMARY\n")
+ tools.summarize(all_paths)
+
+ print("\nGENERATING COMBINED VISUALIZATION\n")
+ tools.viz(all_paths)
+
+ import matplotlib.pyplot as plt
+ plt.savefig("latency.svg")
+ print("✓ SVG visualization ready!")
finally:
- # Clean up patches to avoid side effects in later cells
plt.savefig = _orig_savefig
plt.close = _orig_close
-
-print()
-print("ANALYSIS COMPLETE")
-print(f"Total implementations analyzed: {len(all_paths)}")
-print(f"\nImplementations included:")
-for name, cache_dir in cache_dirs.items():
- if cache_dir:
- path = Path(cache_dir) / file_mapping[name]
- if path.exists() and path.stat().st_size > 0:
- print(f" ✓ {name}")
-
-
-
-# Collect all benchmark data and export to CSV
-all_data = {}
-for name, cache_dir in cache_dirs.items():
- if cache_dir:
- path = Path(cache_dir) / file_mapping[name]
- if path.exists() and path.stat().st_size > 0:
- with open(path, 'r') as f:
- records = [json.loads(line) for line in f]
- all_data[name] = records
-
-# Export to CSV
-csv_path = Path("latency.csv")
-with open(csv_path, 'w', newline='') as csvfile:
- writer = csv.writer(csvfile)
-
- # Write header
- header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
- "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
- # "Compile (ms)",
- "Peak Mem (MB)", "Backend", "Family"]
- writer.writerow(header)
-
- # Write data rows
- for impl_name, records in all_data.items():
- for record in records:
- wl = record.get('wl', {})
- lat = record.get('lat_ms', {})
- tags = record.get('tags', {})
-
- row = [
- impl_name,
- record.get('impl', ''),
- wl.get('name', ''),
- wl.get('batch', ''),
- wl.get('seq_len', ''),
- wl.get('heads', ''),
- wl.get('head_dim', ''),
- wl.get('dtype', ''),
- lat.get('mean', ''),
- lat.get('p10', ''),
- lat.get('p50', ''),
- lat.get('p90', ''),
- lat.get('reps', ''),
- # record.get('compile_ms', ''),
- round(record.get('peak_bytes', 0) / 1024 / 1024, 2) if record.get('peak_bytes') else '',
- tags.get('backend', ''),
- tags.get('family', ''),
- ]
- writer.writerow(row)
-
-print(f"✓ CSV export complete: {csv_path}")
-print(f"Total implementations: {len(all_data)}")
-print(f"Total records: {sum(len(records) for records in all_data.values())}")
✓ Found Flash (PyTorch SDPA): /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/669be66a140da2b4e0da461648c43c1d5d868bfecd1d86ce3c5cefe7bf6b5095/attn.jsonl
+✓ Found MemEff (PyTorch SDPA): /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/d50b1832883940bf8e1b301f9e544d51e67d3edfec86eea4427285c75792eca1/attn.jsonl
+✓ Found xFormers: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/7663611410e2106f3875f69749feba575364abf561b07527376e06da9ccbca78/attn.jsonl
+✓ Found Compiled (default): /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/7a1d1c41a3d743b4b34d02393e8397ba23383940890d8fd1f4f46e296d62938e/attn_default.jsonl
+✓ Found Compiled (max-autotune): /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/314fe0e8997361ffe9c6a2252cce35c8b2184007bb328de7f71e529e42a46d73/attn_max_autotune.jsonl
+✓ Found HF Kernels Flash Attn: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/60f199ad338be11b49f2e8e781c982a5f107106c0c1b7f43ced3be59820560c9/attn.jsonl
+✓ Found HF Kernels Flash Attn3: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a9419f7d27246259f950b2b3abce9149f1cf0c2d35d1bbc5af7cc75ae37baa6d/attn.jsonl

COMBINED BENCHMARK SUMMARY

impl                               wl         p50(ms)  ok
-hf_kernels_flash_attn              flux_L128  0.25     True
-hf_kernels_flash_attn              flux_L256  0.32     True
-hf_kernels_flash_attn              flux_L320  0.34     True
-hf_kernels_flash_attn              flux_L384  0.35     True
-hf_kernels_flash_attn              flux_L448  0.38     True
-hf_kernels_flash_attn              flux_L512  0.42     True
-hf_kernels_flash_attn3             flux_L128  0.28     True
-hf_kernels_flash_attn3             flux_L256  0.34     True
-hf_kernels_flash_attn3             flux_L320  0.36     True
-hf_kernels_flash_attn3             flux_L384  0.37     True
-hf_kernels_flash_attn3             flux_L448  0.40     True
-hf_kernels_flash_attn3             flux_L512  0.43     True
-torch_flash_compiled_default       flux_L128  0.36     True
-torch_flash_compiled_default       flux_L256  0.50     True
-torch_flash_compiled_default       flux_L320  0.54     True
-torch_flash_compiled_default       flux_L384  0.59     True
-torch_flash_compiled_default       flux_L448  0.61     True
-torch_flash_compiled_default       flux_L512  0.64     True
-torch_flash_compiled_max_autotune  flux_L128  0.38     True
-torch_flash_compiled_max_autotune  flux_L256  0.55     True
-torch_flash_compiled_max_autotune  flux_L320  0.61     True
-torch_flash_compiled_max_autotune  flux_L384  0.66     True
-torch_flash_compiled_max_autotune  flux_L448  0.70     True
-torch_flash_compiled_max_autotune  flux_L512  0.76     True
-torch_flash_ma                     flux_L128  0.41     True
-torch_flash_ma                     flux_L256  0.52     True
-torch_flash_ma                     flux_L320  0.55     True
-torch_flash_ma                     flux_L384  0.59     True
-torch_flash_ma                     flux_L448  0.64     True
-torch_flash_ma                     flux_L512  0.68     True
-torch_mem_eff                      flux_L128  0.48     True
-torch_mem_eff                      flux_L256  0.63     True
-torch_mem_eff                      flux_L320  0.70     True
-torch_mem_eff                      flux_L384  0.83     True
-torch_mem_eff                      flux_L448  0.95     True
-torch_mem_eff                      flux_L512  1.00     True
-xformers_meff                      flux_L128  0.35     True
-xformers_meff                      flux_L256  0.41     True
-xformers_meff                      flux_L320  0.43     True
-xformers_meff                      flux_L384  0.44     True
-xformers_meff                      flux_L448  0.48     True
-xformers_meff                      flux_L512  0.50     True
+hf_kernels_flash_attn              flux_L128  0.12     True
+hf_kernels_flash_attn              flux_L256  0.14     True
+hf_kernels_flash_attn              flux_L320  0.14     True
+hf_kernels_flash_attn              flux_L384  0.15     True
+hf_kernels_flash_attn              flux_L448  0.20     True
+hf_kernels_flash_attn              flux_L512  0.20     True
+hf_kernels_flash_attn3             flux_L128  0.13     True
+hf_kernels_flash_attn3             flux_L256  0.15     True
+hf_kernels_flash_attn3             flux_L320  0.16     True
+hf_kernels_flash_attn3             flux_L384  0.16     True
+hf_kernels_flash_attn3             flux_L448  0.21     True
+hf_kernels_flash_attn3             flux_L512  0.21     True
+torch_flash_compiled_default       flux_L128  0.20     True
+torch_flash_compiled_default       flux_L256  0.23     True
+torch_flash_compiled_default       flux_L320  0.24     True
+torch_flash_compiled_default       flux_L384  0.24     True
+torch_flash_compiled_default       flux_L448  FAIL     False
+    Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+torch_flash_compiled_default       flux_L512  FAIL     False
+    Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+torch_flash_compiled_max_autotune  flux_L128  0.19     True
+torch_flash_compiled_max_autotune  flux_L256  0.20     True
+torch_flash_compiled_max_autotune  flux_L320  0.21     True
+torch_flash_compiled_max_autotune  flux_L384  0.21     True
+torch_flash_compiled_max_autotune  flux_L448  FAIL     False
+    Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+torch_flash_compiled_max_autotune  flux_L512  FAIL     False
+    Error: recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value.
+torch_flash_ma                     flux_L128  0.18     True
+torch_flash_ma                     flux_L256  0.21     True
+torch_flash_ma                     flux_L320  0.22     True
+torch_flash_ma                     flux_L384  0.22     True
+torch_flash_ma                     flux_L448  0.27     True
+torch_flash_ma                     flux_L512  0.28     True
+torch_mem_eff                      flux_L128  0.23     True
+torch_mem_eff                      flux_L256  0.26     True
+torch_mem_eff                      flux_L320  0.28     True
+torch_mem_eff                      flux_L384  0.28     True
+torch_mem_eff                      flux_L448  0.30     True
+torch_mem_eff                      flux_L512  0.34     True
+xformers_meff                      flux_L128  0.20     True
+xformers_meff                      flux_L256  0.21     True
+xformers_meff                      flux_L320  0.22     True
+xformers_meff                      flux_L384  0.22     True
+xformers_meff                      flux_L448  0.28     True
+xformers_meff                      flux_L512  0.27     True

GENERATING COMBINED VISUALIZATION

Loaded 42 records
-✓ Combined visualization saved as latency.svg
+✓ Visualization saved as latency.svg
Saved latency.png
-✓ Combined visualization saved as latency.svg
-✓ SVG visualization ready: latency.svg!
-
-ANALYSIS COMPLETE
-Total implementations analyzed: 7
-
-Implementations included:
-  ✓ Flash (PyTorch SDPA)
-  ✓ MemEff (PyTorch SDPA)
-  ✓ xFormers
-  ✓ Compiled (default)
-  ✓ Compiled (max-autotune)
-  ✓ HF Kernels Flash Attn
-  ✓ HF Kernels Flash Attn3
-✓ CSV export complete: latency.csv
-Total implementations: 7
-Total records: 42
-
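The four FAIL rows in the new output are torch.compile recompile-limit trips rather than kernel errors: presumably each new sequence length in the sweep triggers another Dynamo recompile until the budget is exhausted at flux_L448. The error text names the knob itself; a hedged sketch of that workaround (64 is an arbitrary example value, and whether it rescues these two shapes is untested here):

import torch

# Raise Dynamo's recompile budget before compiling, as the error message suggests.
# Run with TORCH_LOGS=recompiles in the environment to see what triggers each recompile.
torch._dynamo.config.cache_size_limit = 64  # arbitrary example; the default is lower

compiled_sdpa = torch.compile(torch.nn.functional.scaled_dot_product_attention)

Elsewhere the new numbers are clean; at flux_L512 the spread runs from 0.20 ms (hf_kernels_flash_attn) to 0.34 ms (torch_mem_eff), roughly 1.7x.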
Artifacts:
-latency.csv
latency.svg

-| Implementation | Impl ID | Workload | Batch | Seq Length | Heads | Head Dim | Dtype | Mean (ms) | P10 (ms) | P50 (ms) | P90 (ms) | Reps | Peak Mem (MB) | Backend | Family |
-|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
-| Flash (PyTorch SDPA) | torch_flash_ma | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.407123202085495 | 0.40537598729133606 | 0.40755200386047363 | 0.407584011554718 | 5 | 83.38 | FLASH | torch-sdpa |
-| Flash (PyTorch SDPA) | torch_flash_ma | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.5235007882118226 | 0.5212159752845764 | 0.5232639908790588 | 0.523360013961792 | 5 | 90.62 | FLASH | torch-sdpa |
-| Flash (PyTorch SDPA) | torch_flash_ma | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.545849597454071 | 0.5418559908866882 | 0.5468159914016724 | 0.5469120144844055 | 5 | 95.06 | FLASH | torch-sdpa |
-| Flash (PyTorch SDPA) | torch_flash_ma | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.5892416119575501 | 0.5867519974708557 | 0.5888000130653381 | 0.5888000130653381 | 5 | 99.88 | FLASH | torch-sdpa |
-| Flash (PyTorch SDPA) | torch_flash_ma | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.6449280023574829 | 0.6430720090866089 | 0.6442239880561829 | 0.6450240015983582 | 5 | 103.81 | FLASH | torch-sdpa |
-| Flash (PyTorch SDPA) | torch_flash_ma | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.6823423862457275 | 0.6777600049972534 | 0.6809599995613098 | 0.6818559765815735 | 5 | 109.12 | FLASH | torch-sdpa |
-| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.48371200561523436 | 0.4821760058403015 | 0.4833280146121979 | 0.4853760004043579 | 5 | 83.38 | EFFICIENT | torch-sdpa |
-| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.6268800020217895 | 0.6246399879455566 | 0.6266880035400391 | 0.6286720037460327 | 5 | 90.62 | EFFICIENT | torch-sdpa |
-| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.699776005744934 | 0.6973440051078796 | 0.7004160284996033 | 0.7004479765892029 | 5 | 95.94 | EFFICIENT | torch-sdpa |
-| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.8333312034606933 | 0.8284159898757935 | 0.8325120210647583 | 0.8376320004463196 | 5 | 100.0 | EFFICIENT | torch-sdpa |
-| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.9533439993858337 | 0.9502720236778259 | 0.9512959718704224 | 0.9572479724884033 | 5 | 103.81 | EFFICIENT | torch-sdpa |
-| MemEff (PyTorch SDPA) | torch_mem_eff | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 1.0066367864608765 | 1.0024960041046143 | 1.0045440196990967 | 1.0097919702529907 | 5 | 109.12 | EFFICIENT | torch-sdpa |
-| xFormers | xformers_meff | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.3452928066253662 | 0.3389439880847931 | 0.3461120128631592 | 0.3461120128631592 | 5 | 83.38 | memory_efficient | xformers |
-| xFormers | xformers_meff | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.41234560012817384 | 0.40959998965263367 | 0.41280001401901245 | 0.41286399960517883 | 5 | 90.62 | memory_efficient | xformers |
-| xFormers | xformers_meff | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.4366208016872406 | 0.4310399889945984 | 0.4331519901752472 | 0.4362240135669708 | 5 | 95.06 | memory_efficient | xformers |
-| xFormers | xformers_meff | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.4450624048709869 | 0.4359680116176605 | 0.44361600279808044 | 0.447488009929657 | 5 | 99.88 | memory_efficient | xformers |
-| xFormers | xformers_meff | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.4750400006771088 | 0.4711039960384369 | 0.47513601183891296 | 0.4763199985027313 | 5 | 103.81 | memory_efficient | xformers |
-| xFormers | xformers_meff | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.5009407997131348 | 0.49663999676704407 | 0.4997119903564453 | 0.5038080215454102 | 5 | 109.12 | memory_efficient | xformers |
-| Compiled (default) | torch_flash_compiled_default | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.3856383919715881 | 0.3563520014286041 | 0.35942399501800537 | 0.3624959886074066 | 5 | 83.38 | FLASH | torch-sdpa |
-| Compiled (default) | torch_flash_compiled_default | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.4982912003993988 | 0.4926080107688904 | 0.49663999676704407 | 0.5017600059509277 | 5 | 90.62 | FLASH | torch-sdpa |
-| Compiled (default) | torch_flash_compiled_default | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.5369919896125793 | 0.5335040092468262 | 0.5366079807281494 | 0.5386239886283875 | 5 | 95.25 | FLASH | torch-sdpa |
-| Compiled (default) | torch_flash_compiled_default | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.5841408014297486 | 0.5775359869003296 | 0.5868800282478333 | 0.5877760052680969 | 5 | 99.88 | FLASH | torch-sdpa |
-| Compiled (default) | torch_flash_compiled_default | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.6184704065322876 | 0.6072319746017456 | 0.6113280057907104 | 0.6144000291824341 | 5 | 103.81 | FLASH | torch-sdpa |
-| Compiled (default) | torch_flash_compiled_default | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.6428672075271606 | 0.6399999856948853 | 0.6430720090866089 | 0.6430720090866089 | 5 | 109.12 | FLASH | torch-sdpa |
-| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.40020479559898375 | 0.3665919899940491 | 0.3768320083618164 | 0.41171199083328247 | 5 | 81.75 | FLASH | torch-sdpa |
-| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.5535807967185974 | 0.5160959959030151 | 0.5489599704742432 | 0.5631359815597534 | 5 | 92.88 | FLASH | torch-sdpa |
-| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.6143999934196472 | 0.562175989151001 | 0.6144000291824341 | 0.6318079829216003 | 5 | 95.13 | FLASH | torch-sdpa |
-| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.6754495978355408 | 0.6512640118598938 | 0.6584320068359375 | 0.6799359917640686 | 5 | 97.13 | FLASH | torch-sdpa |
-| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.7210752129554748 | 0.6973119974136353 | 0.7014080286026001 | 0.7229440212249756 | 5 | 99.0 | FLASH | torch-sdpa |
-| Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.7735359907150269 | 0.7485439777374268 | 0.7557439804077148 | 0.7710719704627991 | 5 | 101.63 | FLASH | torch-sdpa |
-| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.2456959992647171 | 0.24371199309825897 | 0.24566400051116943 | 0.2457599937915802 | 5 | 83.38 | flash-attn | hf-kernels |
-| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.3215551972389221 | 0.3164159953594208 | 0.319487988948822 | 0.32051199674606323 | 5 | 90.62 | flash-attn | hf-kernels |
-| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.3384703993797302 | 0.33670398592948914 | 0.33792001008987427 | 0.33983999490737915 | 5 | 95.06 | flash-attn | hf-kernels |
-| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.3510208010673523 | 0.3481599986553192 | 0.3491840064525604 | 0.35225600004196167 | 5 | 99.88 | flash-attn | hf-kernels |
-| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.3829823970794678 | 0.38095998764038086 | 0.3829759955406189 | 0.3840000033378601 | 5 | 103.81 | flash-attn | hf-kernels |
-| HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.4259391903877258 | 0.4227519929409027 | 0.4249599874019623 | 0.4259839951992035 | 5 | 109.12 | flash-attn | hf-kernels |
-| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.2755008041858673 | 0.26736000180244446 | 0.27561599016189575 | 0.27955201268196106 | 5 | 83.38 | flash-attn3 | hf-kernels |
-| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.3397440016269684 | 0.3368000090122223 | 0.3399679958820343 | 0.34191998839378357 | 5 | 90.62 | flash-attn3 | hf-kernels |
-| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.36019839644432067 | 0.3563520014286041 | 0.3604480028152466 | 0.36137598752975464 | 5 | 95.06 | flash-attn3 | hf-kernels |
-| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.37342079877853396 | 0.3718400001525879 | 0.37379199266433716 | 0.3746879994869232 | 5 | 99.88 | flash-attn3 | hf-kernels |
-| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.4024448037147522 | 0.3993600010871887 | 0.4014720022678375 | 0.4034560024738312 | 5 | 103.81 | flash-attn3 | hf-kernels |
-| HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.4305088043212891 | 0.4270080029964447 | 0.4291520118713379 | 0.4331519901752472 | 5 | 109.12 | flash-attn3 | hf-kernels |
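A closing note on the helper this diff leans on: judging from the inline implementation it deletes, create_svg_with_tagging presumably monkey-patches plt.savefig so that any save, including ones issued inside tools.viz, is forced to SVG at a stable artifact path with CSS-targetable ids, and it hands the originals back for the finally-block restore. A rough sketch of that contract, reconstructed from the removed code rather than from the library source:

from pathlib import Path
import matplotlib.pyplot as plt

def create_svg_with_tagging(out_name: str, hashsalt: str):
    # Sketch only: the real helper lives in kernels_benchmark_tools.core.visuals.
    # The id/class tagging done by the removed _tag_current_figure and
    # _postprocess_svg_add_classes is elided here; hashsalt presumably seeds
    # svg.hashsalt for deterministic ids (see the removed rcParams block).
    out = Path(out_name)
    orig_savefig, orig_close = plt.savefig, plt.close

    def patched_savefig(fname, *args, **kwargs):
        kwargs["format"] = "svg"  # ignore fname: always write SVG to the stable path
        res = orig_savefig(out, *args, **kwargs)
        print(f"✓ Visualization saved as {out}")
        return res

    plt.savefig = patched_savefig
    return orig_savefig, orig_close  # caller restores both in its finally block

The restore in the script's finally block mirrors the cleanup the old inline version performed, so later cells see an unpatched matplotlib either way.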