diff --git "a/flash_attn/impls/flash_attention.html" "b/flash_attn/impls/flash_attention.html" --- "a/flash_attn/impls/flash_attention.html" +++ "b/flash_attn/impls/flash_attention.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -4106,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.26s | Raw @@ -4123,16 +3905,16 @@ Cell: nv | 0.21s
-
Fri Oct 31 20:13:43 2025       
+
Mon Nov 10 21:58:51 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
-|-----------------------------------------+------------------------+----------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   43C    P0             83W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   32C    P0            139W /  350W |       0MiB /  46068MiB |     83%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4156,7 +3938,7 @@ Cell: nv | 0.21s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 3.87s
+Cell: benchmark | 4.03s
  | 
 
 Raw
@@ -4207,29 +3989,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.600ms       101.99%       3.600ms       3.600ms             1  
-                                         torch_flash_ma         6.70%     350.157us        46.68%       2.439ms       2.439ms       0.000us         0.00%       3.570ms       3.570ms             1  
-                     aten::scaled_dot_product_attention         0.81%      42.281us         4.26%     222.626us      74.209us       0.000us         0.00%       2.816ms     938.781us             3  
-              aten::_scaled_dot_product_flash_attention         0.52%      27.002us         3.45%     180.345us      60.115us       0.000us         0.00%       2.816ms     938.781us             3  
-                         aten::_flash_attention_forward         0.79%      41.210us         2.54%     132.453us      44.151us       2.816ms        79.78%       2.816ms     938.781us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.816ms        79.78%       2.816ms     938.781us             3  
-                                       aten::contiguous         0.29%      15.041us        34.44%       1.800ms     149.962us       0.000us         0.00%     753.884us      62.824us            12  
-                                            aten::clone         0.75%      38.969us        34.15%       1.785ms     148.709us       0.000us         0.00%     753.884us      62.824us            12  
-                                            aten::copy_         1.73%      90.324us        31.78%       1.661ms     138.388us     713.788us        20.22%     753.884us      62.824us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     713.788us        20.22%     713.788us      59.482us            12  
-                                Activity Buffer Request        28.08%       1.467ms        28.08%       1.467ms       1.467ms      40.096us         1.14%      40.096us      40.096us             1  
-                                        aten::transpose         1.25%      65.371us         1.68%      87.543us       3.648us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.42%      22.172us         0.42%      22.172us       0.924us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.53%      27.463us         2.06%     107.524us       7.168us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.78%      93.220us         1.78%      93.220us       3.884us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.49%     130.035us         2.49%     130.035us       8.669us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.32%      16.730us         0.32%      16.730us       5.577us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.05%       2.690us         0.05%       2.690us       0.448us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.17%       9.000us         0.17%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        53.32%       2.786ms        53.32%       2.786ms       2.786ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.628ms       101.57%       3.628ms       3.628ms             1  
+                                         torch_flash_ma         5.67%     314.697us        48.49%       2.689ms       2.689ms       0.000us         0.00%       3.612ms       3.612ms             1  
+                     aten::scaled_dot_product_attention         0.72%      39.870us         3.84%     213.234us      71.078us       0.000us         0.00%       2.845ms     948.416us             3  
+              aten::_scaled_dot_product_flash_attention         0.43%      24.020us         3.13%     173.364us      57.788us       0.000us         0.00%       2.845ms     948.416us             3  
+                         aten::_flash_attention_forward         0.70%      39.034us         2.33%     129.042us      43.014us       2.845ms        79.65%       2.845ms     948.416us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.845ms        79.65%       2.845ms     948.416us             3  
+                                       aten::contiguous         0.22%      12.191us        37.88%       2.101ms     175.086us       0.000us         0.00%     766.879us      63.907us            12  
+                                            aten::clone         0.59%      32.480us        37.66%       2.089ms     174.070us       0.000us         0.00%     766.879us      63.907us            12  
+                                            aten::copy_         1.56%      86.776us        35.66%       1.978ms     164.799us     726.879us        20.35%     766.879us      63.907us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     726.879us        20.35%     726.879us      60.573us            12  
+                                Activity Buffer Request        32.26%       1.789ms        32.26%       1.789ms       1.789ms      40.000us         1.12%      40.000us      40.000us             1  
+                                        aten::transpose         1.07%      59.612us         1.46%      80.772us       3.365us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.38%      21.160us         0.38%      21.160us       0.882us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.40%      22.459us         1.80%      99.659us       6.644us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.66%      92.037us         1.66%      92.037us       3.835us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.29%     126.900us         2.29%     126.900us       8.460us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.28%      15.620us         0.28%      15.620us       5.207us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       2.280us         0.04%       2.280us       0.380us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.20%      11.200us         0.20%      11.200us       3.733us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        51.51%       2.857ms        51.51%       2.857ms       2.857ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.225ms
-Self CUDA time total: 3.530ms
+Self CPU time total: 5.546ms
+Self CUDA time total: 3.572ms
 
 
 
@@ -4239,29 +4021,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.88%     260.255us        42.26%       2.252ms       2.252ms       0.000us         0.00%       3.798ms       3.798ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.753ms       100.28%       3.753ms       3.753ms             1  
-                     aten::scaled_dot_product_attention         0.49%      25.890us         3.50%     186.735us      62.245us       0.000us         0.00%       2.976ms     991.858us             3  
-              aten::_scaled_dot_product_flash_attention         0.33%      17.842us         3.02%     160.845us      53.615us       0.000us         0.00%       2.976ms     991.858us             3  
-                         aten::_flash_attention_forward         0.74%      39.289us         2.26%     120.363us      40.121us       2.976ms        79.51%       2.976ms     991.858us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.976ms        79.51%       2.976ms     991.858us             3  
-                                       aten::contiguous         0.20%      10.403us        33.03%       1.760ms     146.680us       0.000us         0.00%     822.042us      68.504us            12  
-                                            aten::clone         0.53%      28.238us        32.84%       1.750ms     145.813us       0.000us         0.00%     822.042us      68.504us            12  
-                                            aten::copy_         1.51%      80.312us        31.12%       1.659ms     138.210us     766.874us        20.49%     822.042us      68.504us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     766.874us        20.49%     766.874us      63.906us            12  
-                                Activity Buffer Request        28.02%       1.493ms        28.02%       1.493ms       1.493ms      55.168us         1.47%      55.168us      55.168us             1  
-                                        aten::transpose         0.94%      50.313us         1.27%      67.673us       2.820us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.33%      17.360us         0.33%      17.360us       0.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.40%      21.528us         1.56%      83.370us       5.558us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.43%      76.263us         1.43%      76.263us       3.178us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.08%     110.943us         2.08%     110.943us       7.396us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.27%      14.621us         0.27%      14.621us       4.874us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.781us         0.03%       1.781us       0.297us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.08%       4.011us         0.08%       4.011us       1.337us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        57.74%       3.077ms        57.74%       3.077ms       3.077ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.57%     259.472us        46.25%       2.626ms       2.626ms       0.000us         0.00%       3.786ms       3.786ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.742ms       100.27%       3.742ms       3.742ms             1  
+                     aten::scaled_dot_product_attention         0.42%      24.011us         3.41%     193.713us      64.571us       0.000us         0.00%       2.968ms     989.492us             3  
+              aten::_scaled_dot_product_flash_attention         0.33%      18.660us         2.99%     169.702us      56.567us       0.000us         0.00%       2.968ms     989.492us             3  
+                         aten::_flash_attention_forward         0.83%      47.240us         2.21%     125.672us      41.891us       2.968ms        79.55%       2.968ms     989.492us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.968ms        79.55%       2.968ms     989.492us             3  
+                                       aten::contiguous         0.19%      10.613us        37.48%       2.128ms     177.333us       0.000us         0.00%     817.342us      68.112us            12  
+                                            aten::clone         0.52%      29.369us        37.29%       2.117ms     176.448us       0.000us         0.00%     817.342us      68.112us            12  
+                                            aten::copy_         1.41%      80.272us        35.64%       2.023ms     168.619us     762.942us        20.45%     817.342us      68.112us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     762.942us        20.45%     762.942us      63.579us            12  
+                                Activity Buffer Request        32.67%       1.855ms        32.67%       1.855ms       1.855ms      54.400us         1.46%      54.400us      54.400us             1  
+                                        aten::transpose         0.90%      51.353us         1.23%      69.912us       2.913us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      18.559us         0.33%      18.559us       0.773us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.37%      20.909us         1.47%      83.391us       5.559us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.39%      78.982us         1.39%      78.982us       3.291us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.94%     110.382us         1.94%     110.382us       7.359us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.24%      13.461us         0.24%      13.461us       4.487us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.710us         0.05%       2.710us       0.452us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.09%       4.940us         0.09%       4.940us       1.647us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.75%       3.052ms        53.75%       3.052ms       3.052ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.329ms
-Self CUDA time total: 3.742ms
+Self CPU time total: 5.678ms
+Self CUDA time total: 3.731ms
 
 
 
@@ -4271,29 +4053,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.87%     262.676us        41.62%       2.245ms       2.245ms       0.000us         0.00%       3.882ms       3.882ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.834ms       100.29%       3.834ms       3.834ms             1  
-                     aten::scaled_dot_product_attention         0.50%      26.770us         3.49%     188.015us      62.672us       0.000us         0.00%       3.044ms       1.015ms             3  
-              aten::_scaled_dot_product_flash_attention         0.35%      18.803us         2.99%     161.245us      53.748us       0.000us         0.00%       3.044ms       1.015ms             3  
-                         aten::_flash_attention_forward         0.74%      39.829us         2.21%     119.102us      39.701us       3.044ms        79.61%       3.044ms       1.015ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.044ms        79.61%       3.044ms       1.015ms             3  
-                                       aten::contiguous         0.18%       9.451us        32.36%       1.746ms     145.465us       0.000us         0.00%     838.367us      69.864us            12  
-                                            aten::clone         0.54%      28.881us        32.18%       1.736ms     144.678us       0.000us         0.00%     838.367us      69.864us            12  
-                                            aten::copy_         1.51%      81.201us        30.48%       1.644ms     137.016us     779.615us        20.39%     838.367us      69.864us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     779.615us        20.39%     779.615us      64.968us            12  
-                                Activity Buffer Request        27.31%       1.473ms        27.31%       1.473ms       1.473ms      58.752us         1.54%      58.752us      58.752us             1  
-                                        aten::transpose         1.01%      54.592us         1.34%      72.471us       3.020us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.33%      17.879us         0.33%      17.879us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.37%      20.117us         1.53%      82.751us       5.517us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.41%      76.295us         1.41%      76.295us       3.179us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.13%     114.795us         2.13%     114.795us       7.653us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.27%      14.801us         0.27%      14.801us       4.934us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       2.110us         0.04%       2.110us       0.352us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.990us         0.07%       3.990us       1.330us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.38%       3.149ms        58.38%       3.149ms       3.149ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.60%     260.065us        44.20%       2.500ms       2.500ms       0.000us         0.00%       3.871ms       3.871ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.823ms       100.27%       3.823ms       3.823ms             1  
+                     aten::scaled_dot_product_attention         0.46%      25.840us         3.28%     185.632us      61.877us       0.000us         0.00%       3.035ms       1.012ms             3  
+              aten::_scaled_dot_product_flash_attention         0.32%      17.999us         2.82%     159.792us      53.264us       0.000us         0.00%       3.035ms       1.012ms             3  
+                         aten::_flash_attention_forward         0.73%      41.121us         2.09%     118.472us      39.491us       3.035ms        79.59%       3.035ms       1.012ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.035ms        79.59%       3.035ms       1.012ms             3  
+                                       aten::contiguous         0.19%      10.499us        35.53%       2.010ms     167.521us       0.000us         0.00%     836.093us      69.674us            12  
+                                            aten::clone         0.50%      28.109us        35.35%       2.000ms     166.646us       0.000us         0.00%     836.093us      69.674us            12  
+                                            aten::copy_         1.42%      80.472us        33.72%       1.908ms     158.959us     778.333us        20.41%     836.093us      69.674us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     778.333us        20.41%     778.333us      64.861us            12  
+                                Activity Buffer Request        30.89%       1.747ms        30.89%       1.747ms       1.747ms      57.760us         1.51%      57.760us      57.760us             1  
+                                        aten::transpose         0.88%      49.936us         1.20%      67.813us       2.826us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.32%      17.877us         0.32%      17.877us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.36%      20.321us         1.47%      83.262us       5.551us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.37%      77.333us         1.37%      77.333us       3.222us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.81%     102.481us         1.81%     102.481us       6.832us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.25%      14.120us         0.25%      14.120us       4.707us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.688us         0.03%       1.688us       0.281us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.09%       5.331us         0.09%       5.331us       1.777us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        55.80%       3.157ms        55.80%       3.157ms       3.157ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.395ms
-Self CUDA time total: 3.823ms
+Self CPU time total: 5.657ms
+Self CUDA time total: 3.813ms
 
 
 
@@ -4303,29 +4085,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.61%     261.106us        43.54%       2.469ms       2.469ms       0.000us         0.00%       3.945ms       3.945ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.898ms       100.28%       3.898ms       3.898ms             1  
-                     aten::scaled_dot_product_attention         0.46%      26.241us         3.40%     192.654us      64.218us       0.000us         0.00%       3.100ms       1.033ms             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      19.509us         2.94%     166.413us      55.471us       0.000us         0.00%       3.100ms       1.033ms             3  
-                         aten::_flash_attention_forward         0.74%      42.081us         2.16%     122.633us      40.878us       3.100ms        79.76%       3.100ms       1.033ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.100ms        79.76%       3.100ms       1.033ms             3  
-                                       aten::contiguous         0.20%      11.161us        34.71%       1.968ms     163.994us       0.000us         0.00%     844.704us      70.392us            12  
-                                            aten::clone         0.52%      29.682us        34.51%       1.957ms     163.064us       0.000us         0.00%     844.704us      70.392us            12  
-                                            aten::copy_         1.45%      82.261us        32.81%       1.860ms     155.026us     786.784us        20.24%     844.704us      70.392us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     786.784us        20.24%     786.784us      65.565us            12  
-                                Activity Buffer Request        26.26%       1.489ms        26.26%       1.489ms       1.489ms      57.920us         1.49%      57.920us      57.920us             1  
-                                        aten::transpose         0.95%      53.820us         1.26%      71.322us       2.972us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.31%      17.502us         0.31%      17.502us       0.729us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.39%      21.943us         1.53%      86.983us       5.799us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.40%      79.202us         1.40%      79.202us       3.300us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         5.55%     314.487us         5.55%     314.487us      20.966us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.26%      14.830us         0.26%      14.830us       4.943us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       2.010us         0.04%       2.010us       0.335us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.040us         0.07%       4.040us       1.347us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        56.46%       3.201ms        56.46%       3.201ms       3.201ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.36%     258.876us        46.43%       2.758ms       2.758ms       0.000us         0.00%       3.960ms       3.960ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.911ms       100.27%       3.911ms       3.911ms             1  
+                     aten::scaled_dot_product_attention         0.42%      24.860us         4.02%     238.593us      79.531us       0.000us         0.00%       3.109ms       1.036ms             3  
+              aten::_scaled_dot_product_flash_attention         0.32%      19.211us         3.60%     213.733us      71.244us       0.000us         0.00%       3.109ms       1.036ms             3  
+                         aten::_flash_attention_forward         0.74%      43.768us         2.88%     170.772us      56.924us       3.109ms        79.70%       3.109ms       1.036ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.109ms        79.70%       3.109ms       1.036ms             3  
+                                       aten::contiguous         0.17%      10.099us        37.27%       2.213ms     184.454us       0.000us         0.00%     850.560us      70.880us            12  
+                                            aten::clone         0.48%      28.250us        37.10%       2.203ms     183.613us       0.000us         0.00%     850.560us      70.880us            12  
+                                            aten::copy_         1.36%      80.903us        35.54%       2.111ms     175.896us     791.680us        20.30%     850.560us      70.880us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     791.680us        20.30%     791.680us      65.973us            12  
+                                Activity Buffer Request        29.13%       1.730ms        29.13%       1.730ms       1.730ms      58.880us         1.51%      58.880us      58.880us             1  
+                                        aten::transpose         0.86%      50.781us         1.18%      70.362us       2.932us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      19.581us         0.33%      19.581us       0.816us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.35%      20.589us         1.40%      83.331us       5.555us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.32%      78.663us         1.32%      78.663us       3.278us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.47%     324.743us         5.47%     324.743us      21.650us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.23%      13.800us         0.23%      13.800us       4.600us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.80%      47.662us         0.80%      47.662us       7.944us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.10%       5.930us         0.10%       5.930us       1.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.57%       3.181ms        53.57%       3.181ms       3.181ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.670ms
-Self CUDA time total: 3.887ms
+Self CPU time total: 5.939ms
+Self CUDA time total: 3.901ms
 
 
 
@@ -4335,29 +4117,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         5.12%     312.519us        40.82%       2.493ms       2.493ms       0.000us         0.00%       4.416ms       4.416ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.365ms       100.24%       4.365ms       4.365ms             1  
-                     aten::scaled_dot_product_attention         0.42%      25.922us         3.20%     195.246us      65.082us       0.000us         0.00%       3.547ms       1.182ms             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      20.847us         2.77%     169.324us      56.441us       0.000us         0.00%       3.547ms       1.182ms             3  
-                         aten::_flash_attention_forward         0.72%      44.243us         2.07%     126.303us      42.101us       3.547ms        81.45%       3.547ms       1.182ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.547ms        81.45%       3.547ms       1.182ms             3  
-                                       aten::contiguous         0.17%      10.559us        31.73%       1.938ms     161.473us       0.000us         0.00%     869.122us      72.427us            12  
-                                            aten::clone         0.47%      28.763us        31.56%       1.927ms     160.593us       0.000us         0.00%     869.122us      72.427us            12  
-                                            aten::copy_         1.36%      83.033us        30.01%       1.832ms     152.707us     807.906us        18.55%     869.122us      72.427us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     807.906us        18.55%     807.906us      67.326us            12  
-                                Activity Buffer Request        24.51%       1.497ms        24.51%       1.497ms       1.497ms      61.216us         1.41%      61.216us      61.216us             1  
-                                        aten::transpose         0.85%      52.195us         1.14%      69.864us       2.911us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.29%      17.669us         0.29%      17.669us       0.736us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.34%      20.921us         1.44%      87.791us       5.853us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.30%      79.270us         1.30%      79.270us       3.303us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         4.55%     277.575us         4.55%     277.575us      18.505us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.27%      16.520us         0.27%      16.520us       5.507us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.960us         0.03%       1.960us       0.327us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.040us         0.07%       4.040us       1.347us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        59.18%       3.614ms        59.18%       3.614ms       3.614ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.85%     313.852us        44.01%       2.846ms       2.846ms       0.000us         0.00%       4.405ms       4.405ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.356ms       100.24%       4.356ms       4.356ms             1  
+                     aten::scaled_dot_product_attention         0.40%      25.602us         2.92%     188.673us      62.891us       0.000us         0.00%       3.542ms       1.181ms             3  
+              aten::_scaled_dot_product_flash_attention         0.29%      18.450us         2.52%     163.071us      54.357us       0.000us         0.00%       3.542ms       1.181ms             3  
+                         aten::_flash_attention_forward         0.66%      42.791us         1.88%     121.422us      40.474us       3.542ms        81.52%       3.542ms       1.181ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.542ms        81.52%       3.542ms       1.181ms             3  
+                                       aten::contiguous         0.15%       9.702us        35.55%       2.299ms     191.596us       0.000us         0.00%     862.461us      71.872us            12  
+                                            aten::clone         0.45%      28.857us        35.40%       2.289ms     190.788us       0.000us         0.00%     862.461us      71.872us            12  
+                                            aten::copy_         1.23%      79.423us        33.92%       2.194ms     182.809us     803.166us        18.48%     862.461us      71.872us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     803.166us        18.48%     803.166us      66.930us            12  
+                                Activity Buffer Request        28.18%       1.822ms        28.18%       1.822ms       1.822ms      59.295us         1.36%      59.295us      59.295us             1  
+                                        aten::transpose         0.77%      49.902us         1.04%      67.461us       2.811us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.27%      17.559us         0.27%      17.559us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.33%      21.611us         1.34%      86.704us       5.780us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.24%      80.042us         1.24%      80.042us       3.335us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.86%     314.554us         4.86%     314.554us      20.970us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.23%      14.691us         0.23%      14.691us       4.897us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.700us         0.03%       1.700us       0.283us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.08%       4.940us         0.08%       4.940us       1.647us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        55.99%       3.621ms        55.99%       3.621ms       3.621ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.107ms
-Self CUDA time total: 4.355ms
+Self CPU time total: 6.467ms
+Self CUDA time total: 4.345ms
 
 
 
@@ -4367,38 +4149,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         3.85%     236.256us        38.02%       2.335ms       2.335ms       0.000us         0.00%       4.535ms       4.535ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.485ms       100.25%       4.485ms       4.485ms             1  
-                     aten::scaled_dot_product_attention         0.43%      26.452us         2.98%     183.275us      61.092us       0.000us         0.00%       3.655ms       1.218ms             3  
-              aten::_scaled_dot_product_flash_attention         0.30%      18.620us         2.55%     156.823us      52.274us       0.000us         0.00%       3.655ms       1.218ms             3  
-                         aten::_flash_attention_forward         0.59%      36.060us         1.88%     115.323us      38.441us       3.655ms        81.69%       3.655ms       1.218ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.655ms        81.69%       3.655ms       1.218ms             3  
-                                       aten::contiguous         0.16%       9.770us        30.40%       1.867ms     155.567us       0.000us         0.00%     880.065us      73.339us            12  
-                                            aten::clone         0.46%      28.179us        30.24%       1.857ms     154.753us       0.000us         0.00%     880.065us      73.339us            12  
-                                            aten::copy_         1.36%      83.563us        28.74%       1.765ms     147.054us     819.137us        18.31%     880.065us      73.339us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     819.137us        18.31%     819.137us      68.261us            12  
-                                Activity Buffer Request        23.24%       1.427ms        23.24%       1.427ms       1.427ms      60.928us         1.36%      60.928us      60.928us             1  
-                                        aten::transpose         0.86%      52.980us         1.16%      71.060us       2.961us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.29%      18.080us         0.29%      18.080us       0.753us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.34%      20.930us         1.37%      83.913us       5.594us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.25%      77.043us         1.25%      77.043us       3.210us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         4.54%     278.990us         4.54%     278.990us      18.599us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.24%      14.661us         0.24%      14.661us       4.887us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.978us         0.03%       1.978us       0.330us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.06%       3.901us         0.06%       3.901us       1.300us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        61.98%       3.806ms        61.98%       3.806ms       3.806ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         3.49%     226.744us        41.30%       2.682ms       2.682ms       0.000us         0.00%       4.507ms       4.507ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.456ms       100.23%       4.456ms       4.456ms             1  
+                     aten::scaled_dot_product_attention         0.39%      25.000us         2.68%     173.753us      57.918us       0.000us         0.00%       3.635ms       1.212ms             3  
+              aten::_scaled_dot_product_flash_attention         0.28%      18.340us         2.29%     148.753us      49.584us       0.000us         0.00%       3.635ms       1.212ms             3  
+                         aten::_flash_attention_forward         0.53%      34.164us         1.68%     109.263us      36.421us       3.635ms        81.77%       3.635ms       1.212ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.635ms        81.77%       3.635ms       1.212ms             3  
+                                       aten::contiguous         0.14%       8.821us        34.49%       2.240ms     186.626us       0.000us         0.00%     871.422us      72.619us            12  
+                                            aten::clone         0.41%      26.612us        34.36%       2.231ms     185.890us       0.000us         0.00%     871.422us      72.619us            12  
+                                            aten::copy_         1.18%      76.909us        32.95%       2.140ms     178.308us     810.270us        18.23%     871.422us      72.619us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     810.270us        18.23%     810.270us      67.523us            12  
+                                Activity Buffer Request        27.48%       1.784ms        27.48%       1.784ms       1.784ms      61.152us         1.38%      61.152us      61.152us             1  
+                                        aten::transpose         0.71%      45.940us         0.97%      63.019us       2.626us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.26%      17.079us         0.26%      17.079us       0.712us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.30%      19.781us         1.27%      82.742us       5.516us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.21%      78.423us         1.21%      78.423us       3.268us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.62%     300.294us         4.62%     300.294us      20.020us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.21%      13.430us         0.21%      13.430us       4.477us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.02%       1.610us         0.02%       1.610us       0.268us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.648us         0.07%       4.648us       1.549us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.70%       3.811ms        58.70%       3.811ms       3.811ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.141ms
-Self CUDA time total: 4.474ms
+Self CPU time total: 6.493ms
+Self CUDA time total: 4.445ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
+torch_flash_ma           cuda_attn_L128_bfloat16     1.23  True
 torch_flash_ma           cuda_attn_L256_bfloat16     1.28  True
 torch_flash_ma           cuda_attn_L320_bfloat16     1.30  True
 torch_flash_ma           cuda_attn_L384_bfloat16     1.33  True
-torch_flash_ma           cuda_attn_L448_bfloat16     1.50  True
-torch_flash_ma           cuda_attn_L512_bfloat16     1.51  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.48  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.52  True
 

Artifacts: