+Cell: combine | 36.17s
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import os
+import sys
+from pathlib import Path
+
+import torch
+import kernels_benchmark_tools as kbt
+
+# Discover the upstream artifact directories from environment variables
+cache_dirs = {
+ "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
+ "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
+ "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
+ "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
+ "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
+ "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
+ "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
+ "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
+ "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
+}
+
+print("LOADING BENCHMARK DATA")
+for name, cache_dir in cache_dirs.items():
+ print(f"{name:30s}: {cache_dir}")
+print()
+
+# Collect all JSONL paths
+all_paths = []
+file_mapping = {
+ "Flash (PyTorch SDPA)": "attn.jsonl",
+ "MemEff (PyTorch SDPA)": "attn.jsonl",
+ "Flash Attn 2": "attn.jsonl",
+ "xFormers": "attn.jsonl",
+ "SageAttention": "attn.jsonl",
+ "Compiled (default)": "attn_default.jsonl",
+ "Compiled (max-autotune)": "attn_max_autotune.jsonl",
+ "HF Kernels Flash Attn": "attn.jsonl",
+ "HF Kernels Flash Attn3": "attn.jsonl",
+}
+
+for name, cache_dir in cache_dirs.items():
+ if cache_dir:
+ jsonl_file = file_mapping[name]
+ path = Path(cache_dir) / jsonl_file
+ if path.exists() and path.stat().st_size > 0:
+ all_paths.append(str(path))
+ print(f"✓ Found {name}: {path}")
+ else:
+ print(f"⊘ Empty/Missing {name}: {path}")
+ else:
+ print(f"✗ No cache dir for {name}")
+
+print()
+
+if not all_paths:
+ print("ERROR: No benchmark data files found!")
+ sys.exit(1)
+
+# Generate combined summary
+print("COMBINED BENCHMARK SUMMARY")
+print()
+
+kbt.summarize(all_paths)
+
+print()
+print("GENERATING COMBINED VISUALIZATION")
+print()
+
+try:
+ kbt.viz(all_paths)
+ print("✓ Combined visualization saved as latency.png")
+except ImportError as e:
+ print(f"✗ Visualization requires matplotlib: {e}")
+except Exception as e:
+ print(f"✗ Visualization failed: {e}")
+
+print()
+print("ANALYSIS COMPLETE")
+print(f"Total implementations analyzed: {len(all_paths)}")
+print(f"\nImplementations included:")
+for name, cache_dir in cache_dirs.items():
+ if cache_dir:
+ jsonl_file = file_mapping[name]
+ path = Path(cache_dir) / jsonl_file
+ if path.exists() and path.stat().st_size > 0:
+ print(f" ✓ {name}")
+
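+The cell relies on uvnote exposing each upstream cell's cache directory through a
+UVNOTE_FILE_* environment variable, then hands the merged JSONL paths to
+kbt.summarize and kbt.viz. For a quick sanity check of the merged inputs outside
+of kernels-benchmark-tools, the files can be read directly with the stdlib. This
+is a minimal sketch, assuming each JSONL line is a JSON object with an "impl"
+field (suggested by the summary columns below, not verified against the
+kernels-benchmark-tools record schema):
+
+import json
+from collections import Counter
+from pathlib import Path
+
+def count_records(paths):
+    """Count benchmark records per implementation across JSONL files."""
+    counts = Counter()
+    for p in paths:
+        with Path(p).open() as f:
+            for line in f:
+                line = line.strip()
+                if line:  # skip blank lines
+                    counts[json.loads(line).get("impl", "unknown")] += 1
+    return counts
+
+# e.g. count_records(all_paths) should report 6 records per implementation here
+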
+
+LOADING BENCHMARK DATA
+Flash (PyTorch SDPA) : /repo/flash_attn/impls/.uvnote/cache/327a3408e7cdfeef6984786686ce13137074d9f083e6e434c29f02589d28a0f8
+MemEff (PyTorch SDPA) : /repo/flash_attn/impls/.uvnote/cache/25ca9e52daa50b9289780b3e1302f2949db718140ef9eedd44a8a554afaff9ee
+Flash Attn 2 : None
+xFormers : /repo/flash_attn/impls/.uvnote/cache/6802a31176fbf22c1f5dd5442cf5ae77d8e3527d679642244908984c16933902
+SageAttention : None
+Compiled (default) : /repo/flash_attn/impls/.uvnote/cache/bd779935ea10d468a5a99c29b029da0e0ef4dc2a7b82bc8595d04b2f142a3a44
+Compiled (max-autotune) : /repo/flash_attn/impls/.uvnote/cache/f4bc4785407df53e53f91c190279cdf3dbe3cf7028e2e352d1cc90b92bfcf86e
+HF Kernels Flash Attn : /repo/flash_attn/impls/.uvnote/cache/58c243a8f4effc711ed67ad97c7fbf2124304388a17b4b8e4e43e20e6019e9c9
+HF Kernels Flash Attn3 : /repo/flash_attn/impls/.uvnote/cache/65da999faf55d11c76155fa1d198e77708e1fe8247e3d0b5fd7093a206551ce5
+
+✓ Found Flash (PyTorch SDPA): /repo/flash_attn/impls/.uvnote/cache/327a3408e7cdfeef6984786686ce13137074d9f083e6e434c29f02589d28a0f8/attn.jsonl
+✓ Found MemEff (PyTorch SDPA): /repo/flash_attn/impls/.uvnote/cache/25ca9e52daa50b9289780b3e1302f2949db718140ef9eedd44a8a554afaff9ee/attn.jsonl
+✗ No cache dir for Flash Attn 2
+✓ Found xFormers: /repo/flash_attn/impls/.uvnote/cache/6802a31176fbf22c1f5dd5442cf5ae77d8e3527d679642244908984c16933902/attn.jsonl
+✗ No cache dir for SageAttention
+✓ Found Compiled (default): /repo/flash_attn/impls/.uvnote/cache/bd779935ea10d468a5a99c29b029da0e0ef4dc2a7b82bc8595d04b2f142a3a44/attn_default.jsonl
+✓ Found Compiled (max-autotune): /repo/flash_attn/impls/.uvnote/cache/f4bc4785407df53e53f91c190279cdf3dbe3cf7028e2e352d1cc90b92bfcf86e/attn_max_autotune.jsonl
+✓ Found HF Kernels Flash Attn: /repo/flash_attn/impls/.uvnote/cache/58c243a8f4effc711ed67ad97c7fbf2124304388a17b4b8e4e43e20e6019e9c9/attn.jsonl
+✓ Found HF Kernels Flash Attn3: /repo/flash_attn/impls/.uvnote/cache/65da999faf55d11c76155fa1d198e77708e1fe8247e3d0b5fd7093a206551ce5/attn.jsonl
+
+COMBINED BENCHMARK SUMMARY
+
+impl                               wl          p50(ms)  ok
+hf_kernels_flash_attn              flux_L128      0.34  True
+hf_kernels_flash_attn              flux_L256      0.37  True
+hf_kernels_flash_attn              flux_L320      0.49  True
+hf_kernels_flash_attn              flux_L384      0.51  True
+hf_kernels_flash_attn              flux_L448      0.53  True
+hf_kernels_flash_attn              flux_L512      0.56  True
+hf_kernels_flash_attn3             flux_L128      0.36  True
+hf_kernels_flash_attn3             flux_L256      0.39  True
+hf_kernels_flash_attn3             flux_L320      0.52  True
+hf_kernels_flash_attn3             flux_L384      0.53  True
+hf_kernels_flash_attn3             flux_L448      0.57  True
+hf_kernels_flash_attn3             flux_L512      0.57  True
+torch_flash_compiled_default       flux_L128      0.52  True
+torch_flash_compiled_default       flux_L256      0.56  True
+torch_flash_compiled_default       flux_L320      0.69  True
+torch_flash_compiled_default       flux_L384      0.72  True
+torch_flash_compiled_default       flux_L448      0.74  True
+torch_flash_compiled_default       flux_L512      0.77  True
+torch_flash_compiled_max_autotune  flux_L128      0.65  True
+torch_flash_compiled_max_autotune  flux_L256      0.68  True
+torch_flash_compiled_max_autotune  flux_L320      0.82  True
+torch_flash_compiled_max_autotune  flux_L384      0.85  True
+torch_flash_compiled_max_autotune  flux_L448      0.88  True
+torch_flash_compiled_max_autotune  flux_L512      0.92  True
+torch_flash_ma                     flux_L128      0.48  True
+torch_flash_ma                     flux_L256      0.53  True
+torch_flash_ma                     flux_L320      0.65  True
+torch_flash_ma                     flux_L384      0.68  True
+torch_flash_ma                     flux_L448      0.71  True
+torch_flash_ma                     flux_L512      0.74  True
+torch_mem_eff                      flux_L128      0.59  True
+torch_mem_eff                      flux_L256      0.65  True
+torch_mem_eff                      flux_L320      0.78  True
+torch_mem_eff                      flux_L384      0.79  True
+torch_mem_eff                      flux_L448      0.85  True
+torch_mem_eff                      flux_L512      0.95  True
+xformers_meff                      flux_L128      0.45  True
+xformers_meff                      flux_L256      0.47  True
+xformers_meff                      flux_L320      0.60  True
+xformers_meff                      flux_L384      0.60  True
+xformers_meff                      flux_L448      0.64  True
+xformers_meff                      flux_L512      0.64  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 42 records
+Saved latency.png
+✓ Combined visualization saved as latency.png
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 7
+
+Implementations included:
+ ✓ Flash (PyTorch SDPA)
+ ✓ MemEff (PyTorch SDPA)
+ ✓ xFormers
+ ✓ Compiled (default)
+ ✓ Compiled (max-autotune)
+ ✓ HF Kernels Flash Attn
+ ✓ HF Kernels Flash Attn3
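+
+At the longest workload (flux_L512), the HF Kernels Flash Attn p50 of 0.56 ms is
+roughly 1.7x faster than torch_mem_eff (0.95 ms) and about 1.3x faster than the
+non-compiled torch_flash_ma path (0.74 ms). A small sketch to reproduce those
+ratios, with the p50 values copied by hand from the table above:
+
+# p50 latencies in ms at flux_L512, taken from the combined summary
+p50_ms = {
+    "hf_kernels_flash_attn": 0.56,
+    "hf_kernels_flash_attn3": 0.57,
+    "xformers_meff": 0.64,
+    "torch_flash_ma": 0.74,
+    "torch_flash_compiled_default": 0.77,
+    "torch_flash_compiled_max_autotune": 0.92,
+    "torch_mem_eff": 0.95,
+}
+best = min(p50_ms.values())
+for impl, ms in sorted(p50_ms.items(), key=lambda kv: kv[1]):
+    # slowdown factor relative to the fastest implementation
+    print(f"{impl:35s} {ms:5.2f} ms  {ms / best:4.2f}x vs fastest")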
+