diff --git "a/flash_attn/impls/flash_attention.html" "b/flash_attn/impls/flash_attention.html" --- "a/flash_attn/impls/flash_attention.html" +++ "b/flash_attn/impls/flash_attention.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Fri Oct 31 20:13:43 2025 +Mon Nov 10 21:58:51 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 43C P0 83W / 350W | 0MiB / 46068MiB | 11% Default | +| N/A 32C P0 139W / 350W | 0MiB / 46068MiB | 83% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -4156,7 +3938,7 @@ Cell: nv | 0.21s ▼ output ▶ uv-logs | -Cell: benchmark | 3.87s +Cell: benchmark | 4.03s | Raw @@ -4207,29 +3989,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.600ms 101.99% 3.600ms 3.600ms 1 - torch_flash_ma 6.70% 350.157us 46.68% 2.439ms 2.439ms 0.000us 0.00% 3.570ms 3.570ms 1 - aten::scaled_dot_product_attention 0.81% 42.281us 4.26% 222.626us 74.209us 0.000us 0.00% 2.816ms 938.781us 3 - aten::_scaled_dot_product_flash_attention 0.52% 27.002us 3.45% 180.345us 60.115us 0.000us 0.00% 2.816ms 938.781us 3 - aten::_flash_attention_forward 0.79% 41.210us 2.54% 132.453us 44.151us 2.816ms 79.78% 2.816ms 938.781us 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.816ms 79.78% 2.816ms 938.781us 3 - aten::contiguous 0.29% 15.041us 34.44% 1.800ms 149.962us 0.000us 0.00% 753.884us 62.824us 12 - aten::clone 0.75% 38.969us 34.15% 1.785ms 148.709us 0.000us 0.00% 753.884us 62.824us 12 - aten::copy_ 1.73% 90.324us 31.78% 1.661ms 138.388us 713.788us 20.22% 753.884us 62.824us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 713.788us 20.22% 713.788us 59.482us 12 - Activity Buffer Request 28.08% 1.467ms 28.08% 1.467ms 1.467ms 40.096us 1.14% 40.096us 40.096us 1 - aten::transpose 1.25% 65.371us 1.68% 87.543us 3.648us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.42% 22.172us 0.42% 22.172us 0.924us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.53% 27.463us 2.06% 107.524us 7.168us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.78% 93.220us 1.78% 93.220us 3.884us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 2.49% 130.035us 2.49% 130.035us 8.669us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.32% 16.730us 0.32% 16.730us 5.577us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.05% 2.690us 0.05% 2.690us 0.448us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.17% 9.000us 0.17% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 53.32% 2.786ms 53.32% 2.786ms 2.786ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.628ms 101.57% 3.628ms 3.628ms 1 + torch_flash_ma 5.67% 314.697us 48.49% 2.689ms 2.689ms 0.000us 0.00% 3.612ms 3.612ms 1 + aten::scaled_dot_product_attention 0.72% 39.870us 3.84% 213.234us 71.078us 0.000us 0.00% 2.845ms 948.416us 3 + aten::_scaled_dot_product_flash_attention 0.43% 24.020us 3.13% 173.364us 57.788us 0.000us 0.00% 2.845ms 948.416us 3 + aten::_flash_attention_forward 0.70% 39.034us 2.33% 129.042us 43.014us 2.845ms 79.65% 2.845ms 948.416us 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.845ms 79.65% 2.845ms 948.416us 3 + aten::contiguous 0.22% 12.191us 37.88% 2.101ms 175.086us 0.000us 0.00% 766.879us 63.907us 12 + aten::clone 0.59% 32.480us 37.66% 2.089ms 174.070us 0.000us 0.00% 766.879us 63.907us 12 + aten::copy_ 1.56% 86.776us 35.66% 1.978ms 164.799us 726.879us 20.35% 766.879us 63.907us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 726.879us 20.35% 726.879us 60.573us 12 + Activity Buffer Request 32.26% 1.789ms 32.26% 1.789ms 1.789ms 40.000us 1.12% 40.000us 40.000us 1 + aten::transpose 1.07% 59.612us 1.46% 80.772us 3.365us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.38% 21.160us 0.38% 21.160us 0.882us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.40% 22.459us 1.80% 99.659us 6.644us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.66% 92.037us 1.66% 92.037us 3.835us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 2.29% 126.900us 2.29% 126.900us 8.460us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.28% 15.620us 0.28% 15.620us 5.207us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.04% 2.280us 0.04% 2.280us 0.380us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.20% 11.200us 0.20% 11.200us 3.733us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 51.51% 2.857ms 51.51% 2.857ms 2.857ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.225ms -Self CUDA time total: 3.530ms +Self CPU time total: 5.546ms +Self CUDA time total: 3.572ms @@ -4239,29 +4021,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 4.88% 260.255us 42.26% 2.252ms 2.252ms 0.000us 0.00% 3.798ms 3.798ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.753ms 100.28% 3.753ms 3.753ms 1 - aten::scaled_dot_product_attention 0.49% 25.890us 3.50% 186.735us 62.245us 0.000us 0.00% 2.976ms 991.858us 3 - aten::_scaled_dot_product_flash_attention 0.33% 17.842us 3.02% 160.845us 53.615us 0.000us 0.00% 2.976ms 991.858us 3 - aten::_flash_attention_forward 0.74% 39.289us 2.26% 120.363us 40.121us 2.976ms 79.51% 2.976ms 991.858us 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.976ms 79.51% 2.976ms 991.858us 3 - aten::contiguous 0.20% 10.403us 33.03% 1.760ms 146.680us 0.000us 0.00% 822.042us 68.504us 12 - aten::clone 0.53% 28.238us 32.84% 1.750ms 145.813us 0.000us 0.00% 822.042us 68.504us 12 - aten::copy_ 1.51% 80.312us 31.12% 1.659ms 138.210us 766.874us 20.49% 822.042us 68.504us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 766.874us 20.49% 766.874us 63.906us 12 - Activity Buffer Request 28.02% 1.493ms 28.02% 1.493ms 1.493ms 55.168us 1.47% 55.168us 55.168us 1 - aten::transpose 0.94% 50.313us 1.27% 67.673us 2.820us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.33% 17.360us 0.33% 17.360us 0.723us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.40% 21.528us 1.56% 83.370us 5.558us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.43% 76.263us 1.43% 76.263us 3.178us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 2.08% 110.943us 2.08% 110.943us 7.396us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.27% 14.621us 0.27% 14.621us 4.874us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.03% 1.781us 0.03% 1.781us 0.297us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.08% 4.011us 0.08% 4.011us 1.337us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 57.74% 3.077ms 57.74% 3.077ms 3.077ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 4.57% 259.472us 46.25% 2.626ms 2.626ms 0.000us 0.00% 3.786ms 3.786ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.742ms 100.27% 3.742ms 3.742ms 1 + aten::scaled_dot_product_attention 0.42% 24.011us 3.41% 193.713us 64.571us 0.000us 0.00% 2.968ms 989.492us 3 + aten::_scaled_dot_product_flash_attention 0.33% 18.660us 2.99% 169.702us 56.567us 0.000us 0.00% 2.968ms 989.492us 3 + aten::_flash_attention_forward 0.83% 47.240us 2.21% 125.672us 41.891us 2.968ms 79.55% 2.968ms 989.492us 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.968ms 79.55% 2.968ms 989.492us 3 + aten::contiguous 0.19% 10.613us 37.48% 2.128ms 177.333us 0.000us 0.00% 817.342us 68.112us 12 + aten::clone 0.52% 29.369us 37.29% 2.117ms 176.448us 0.000us 0.00% 817.342us 68.112us 12 + aten::copy_ 1.41% 80.272us 35.64% 2.023ms 168.619us 762.942us 20.45% 817.342us 68.112us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 762.942us 20.45% 762.942us 63.579us 12 + Activity Buffer Request 32.67% 1.855ms 32.67% 1.855ms 1.855ms 54.400us 1.46% 54.400us 54.400us 1 + aten::transpose 0.90% 51.353us 1.23% 69.912us 2.913us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.33% 18.559us 0.33% 18.559us 0.773us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.37% 20.909us 1.47% 83.391us 5.559us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.39% 78.982us 1.39% 78.982us 3.291us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 1.94% 110.382us 1.94% 110.382us 7.359us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.24% 13.461us 0.24% 13.461us 4.487us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.05% 2.710us 0.05% 2.710us 0.452us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.09% 4.940us 0.09% 4.940us 1.647us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 53.75% 3.052ms 53.75% 3.052ms 3.052ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.329ms -Self CUDA time total: 3.742ms +Self CPU time total: 5.678ms +Self CUDA time total: 3.731ms @@ -4271,29 +4053,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 4.87% 262.676us 41.62% 2.245ms 2.245ms 0.000us 0.00% 3.882ms 3.882ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.834ms 100.29% 3.834ms 3.834ms 1 - aten::scaled_dot_product_attention 0.50% 26.770us 3.49% 188.015us 62.672us 0.000us 0.00% 3.044ms 1.015ms 3 - aten::_scaled_dot_product_flash_attention 0.35% 18.803us 2.99% 161.245us 53.748us 0.000us 0.00% 3.044ms 1.015ms 3 - aten::_flash_attention_forward 0.74% 39.829us 2.21% 119.102us 39.701us 3.044ms 79.61% 3.044ms 1.015ms 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.044ms 79.61% 3.044ms 1.015ms 3 - aten::contiguous 0.18% 9.451us 32.36% 1.746ms 145.465us 0.000us 0.00% 838.367us 69.864us 12 - aten::clone 0.54% 28.881us 32.18% 1.736ms 144.678us 0.000us 0.00% 838.367us 69.864us 12 - aten::copy_ 1.51% 81.201us 30.48% 1.644ms 137.016us 779.615us 20.39% 838.367us 69.864us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.615us 20.39% 779.615us 64.968us 12 - Activity Buffer Request 27.31% 1.473ms 27.31% 1.473ms 1.473ms 58.752us 1.54% 58.752us 58.752us 1 - aten::transpose 1.01% 54.592us 1.34% 72.471us 3.020us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.33% 17.879us 0.33% 17.879us 0.745us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.37% 20.117us 1.53% 82.751us 5.517us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.41% 76.295us 1.41% 76.295us 3.179us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 2.13% 114.795us 2.13% 114.795us 7.653us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.27% 14.801us 0.27% 14.801us 4.934us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.04% 2.110us 0.04% 2.110us 0.352us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.07% 3.990us 0.07% 3.990us 1.330us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.38% 3.149ms 58.38% 3.149ms 3.149ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 4.60% 260.065us 44.20% 2.500ms 2.500ms 0.000us 0.00% 3.871ms 3.871ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.823ms 100.27% 3.823ms 3.823ms 1 + aten::scaled_dot_product_attention 0.46% 25.840us 3.28% 185.632us 61.877us 0.000us 0.00% 3.035ms 1.012ms 3 + aten::_scaled_dot_product_flash_attention 0.32% 17.999us 2.82% 159.792us 53.264us 0.000us 0.00% 3.035ms 1.012ms 3 + aten::_flash_attention_forward 0.73% 41.121us 2.09% 118.472us 39.491us 3.035ms 79.59% 3.035ms 1.012ms 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 79.59% 3.035ms 1.012ms 3 + aten::contiguous 0.19% 10.499us 35.53% 2.010ms 167.521us 0.000us 0.00% 836.093us 69.674us 12 + aten::clone 0.50% 28.109us 35.35% 2.000ms 166.646us 0.000us 0.00% 836.093us 69.674us 12 + aten::copy_ 1.42% 80.472us 33.72% 1.908ms 158.959us 778.333us 20.41% 836.093us 69.674us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 778.333us 20.41% 778.333us 64.861us 12 + Activity Buffer Request 30.89% 1.747ms 30.89% 1.747ms 1.747ms 57.760us 1.51% 57.760us 57.760us 1 + aten::transpose 0.88% 49.936us 1.20% 67.813us 2.826us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.32% 17.877us 0.32% 17.877us 0.745us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.36% 20.321us 1.47% 83.262us 5.551us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.37% 77.333us 1.37% 77.333us 3.222us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 1.81% 102.481us 1.81% 102.481us 6.832us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.25% 14.120us 0.25% 14.120us 4.707us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.03% 1.688us 0.03% 1.688us 0.281us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.09% 5.331us 0.09% 5.331us 1.777us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 55.80% 3.157ms 55.80% 3.157ms 3.157ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.395ms -Self CUDA time total: 3.823ms +Self CPU time total: 5.657ms +Self CUDA time total: 3.813ms @@ -4303,29 +4085,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 4.61% 261.106us 43.54% 2.469ms 2.469ms 0.000us 0.00% 3.945ms 3.945ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.898ms 100.28% 3.898ms 3.898ms 1 - aten::scaled_dot_product_attention 0.46% 26.241us 3.40% 192.654us 64.218us 0.000us 0.00% 3.100ms 1.033ms 3 - aten::_scaled_dot_product_flash_attention 0.34% 19.509us 2.94% 166.413us 55.471us 0.000us 0.00% 3.100ms 1.033ms 3 - aten::_flash_attention_forward 0.74% 42.081us 2.16% 122.633us 40.878us 3.100ms 79.76% 3.100ms 1.033ms 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.100ms 79.76% 3.100ms 1.033ms 3 - aten::contiguous 0.20% 11.161us 34.71% 1.968ms 163.994us 0.000us 0.00% 844.704us 70.392us 12 - aten::clone 0.52% 29.682us 34.51% 1.957ms 163.064us 0.000us 0.00% 844.704us 70.392us 12 - aten::copy_ 1.45% 82.261us 32.81% 1.860ms 155.026us 786.784us 20.24% 844.704us 70.392us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 786.784us 20.24% 786.784us 65.565us 12 - Activity Buffer Request 26.26% 1.489ms 26.26% 1.489ms 1.489ms 57.920us 1.49% 57.920us 57.920us 1 - aten::transpose 0.95% 53.820us 1.26% 71.322us 2.972us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.31% 17.502us 0.31% 17.502us 0.729us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.39% 21.943us 1.53% 86.983us 5.799us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.40% 79.202us 1.40% 79.202us 3.300us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 5.55% 314.487us 5.55% 314.487us 20.966us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.26% 14.830us 0.26% 14.830us 4.943us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.04% 2.010us 0.04% 2.010us 0.335us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 56.46% 3.201ms 56.46% 3.201ms 3.201ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 4.36% 258.876us 46.43% 2.758ms 2.758ms 0.000us 0.00% 3.960ms 3.960ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.911ms 100.27% 3.911ms 3.911ms 1 + aten::scaled_dot_product_attention 0.42% 24.860us 4.02% 238.593us 79.531us 0.000us 0.00% 3.109ms 1.036ms 3 + aten::_scaled_dot_product_flash_attention 0.32% 19.211us 3.60% 213.733us 71.244us 0.000us 0.00% 3.109ms 1.036ms 3 + aten::_flash_attention_forward 0.74% 43.768us 2.88% 170.772us 56.924us 3.109ms 79.70% 3.109ms 1.036ms 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.109ms 79.70% 3.109ms 1.036ms 3 + aten::contiguous 0.17% 10.099us 37.27% 2.213ms 184.454us 0.000us 0.00% 850.560us 70.880us 12 + aten::clone 0.48% 28.250us 37.10% 2.203ms 183.613us 0.000us 0.00% 850.560us 70.880us 12 + aten::copy_ 1.36% 80.903us 35.54% 2.111ms 175.896us 791.680us 20.30% 850.560us 70.880us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 791.680us 20.30% 791.680us 65.973us 12 + Activity Buffer Request 29.13% 1.730ms 29.13% 1.730ms 1.730ms 58.880us 1.51% 58.880us 58.880us 1 + aten::transpose 0.86% 50.781us 1.18% 70.362us 2.932us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.33% 19.581us 0.33% 19.581us 0.816us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.35% 20.589us 1.40% 83.331us 5.555us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.32% 78.663us 1.32% 78.663us 3.278us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 5.47% 324.743us 5.47% 324.743us 21.650us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.23% 13.800us 0.23% 13.800us 4.600us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.80% 47.662us 0.80% 47.662us 7.944us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.930us 0.10% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 53.57% 3.181ms 53.57% 3.181ms 3.181ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.670ms -Self CUDA time total: 3.887ms +Self CPU time total: 5.939ms +Self CUDA time total: 3.901ms @@ -4335,29 +4117,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 5.12% 312.519us 40.82% 2.493ms 2.493ms 0.000us 0.00% 4.416ms 4.416ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.365ms 100.24% 4.365ms 4.365ms 1 - aten::scaled_dot_product_attention 0.42% 25.922us 3.20% 195.246us 65.082us 0.000us 0.00% 3.547ms 1.182ms 3 - aten::_scaled_dot_product_flash_attention 0.34% 20.847us 2.77% 169.324us 56.441us 0.000us 0.00% 3.547ms 1.182ms 3 - aten::_flash_attention_forward 0.72% 44.243us 2.07% 126.303us 42.101us 3.547ms 81.45% 3.547ms 1.182ms 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.547ms 81.45% 3.547ms 1.182ms 3 - aten::contiguous 0.17% 10.559us 31.73% 1.938ms 161.473us 0.000us 0.00% 869.122us 72.427us 12 - aten::clone 0.47% 28.763us 31.56% 1.927ms 160.593us 0.000us 0.00% 869.122us 72.427us 12 - aten::copy_ 1.36% 83.033us 30.01% 1.832ms 152.707us 807.906us 18.55% 869.122us 72.427us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 807.906us 18.55% 807.906us 67.326us 12 - Activity Buffer Request 24.51% 1.497ms 24.51% 1.497ms 1.497ms 61.216us 1.41% 61.216us 61.216us 1 - aten::transpose 0.85% 52.195us 1.14% 69.864us 2.911us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.29% 17.669us 0.29% 17.669us 0.736us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.34% 20.921us 1.44% 87.791us 5.853us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.30% 79.270us 1.30% 79.270us 3.303us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 4.55% 277.575us 4.55% 277.575us 18.505us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.27% 16.520us 0.27% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.03% 1.960us 0.03% 1.960us 0.327us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 59.18% 3.614ms 59.18% 3.614ms 3.614ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 4.85% 313.852us 44.01% 2.846ms 2.846ms 0.000us 0.00% 4.405ms 4.405ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.356ms 100.24% 4.356ms 4.356ms 1 + aten::scaled_dot_product_attention 0.40% 25.602us 2.92% 188.673us 62.891us 0.000us 0.00% 3.542ms 1.181ms 3 + aten::_scaled_dot_product_flash_attention 0.29% 18.450us 2.52% 163.071us 54.357us 0.000us 0.00% 3.542ms 1.181ms 3 + aten::_flash_attention_forward 0.66% 42.791us 1.88% 121.422us 40.474us 3.542ms 81.52% 3.542ms 1.181ms 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.542ms 81.52% 3.542ms 1.181ms 3 + aten::contiguous 0.15% 9.702us 35.55% 2.299ms 191.596us 0.000us 0.00% 862.461us 71.872us 12 + aten::clone 0.45% 28.857us 35.40% 2.289ms 190.788us 0.000us 0.00% 862.461us 71.872us 12 + aten::copy_ 1.23% 79.423us 33.92% 2.194ms 182.809us 803.166us 18.48% 862.461us 71.872us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 803.166us 18.48% 803.166us 66.930us 12 + Activity Buffer Request 28.18% 1.822ms 28.18% 1.822ms 1.822ms 59.295us 1.36% 59.295us 59.295us 1 + aten::transpose 0.77% 49.902us 1.04% 67.461us 2.811us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.27% 17.559us 0.27% 17.559us 0.732us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.33% 21.611us 1.34% 86.704us 5.780us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.24% 80.042us 1.24% 80.042us 3.335us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 4.86% 314.554us 4.86% 314.554us 20.970us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.23% 14.691us 0.23% 14.691us 4.897us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.08% 4.940us 0.08% 4.940us 1.647us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 55.99% 3.621ms 55.99% 3.621ms 3.621ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.107ms -Self CUDA time total: 4.355ms +Self CPU time total: 6.467ms +Self CUDA time total: 4.345ms @@ -4367,38 +4149,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 3.85% 236.256us 38.02% 2.335ms 2.335ms 0.000us 0.00% 4.535ms 4.535ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.485ms 100.25% 4.485ms 4.485ms 1 - aten::scaled_dot_product_attention 0.43% 26.452us 2.98% 183.275us 61.092us 0.000us 0.00% 3.655ms 1.218ms 3 - aten::_scaled_dot_product_flash_attention 0.30% 18.620us 2.55% 156.823us 52.274us 0.000us 0.00% 3.655ms 1.218ms 3 - aten::_flash_attention_forward 0.59% 36.060us 1.88% 115.323us 38.441us 3.655ms 81.69% 3.655ms 1.218ms 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 81.69% 3.655ms 1.218ms 3 - aten::contiguous 0.16% 9.770us 30.40% 1.867ms 155.567us 0.000us 0.00% 880.065us 73.339us 12 - aten::clone 0.46% 28.179us 30.24% 1.857ms 154.753us 0.000us 0.00% 880.065us 73.339us 12 - aten::copy_ 1.36% 83.563us 28.74% 1.765ms 147.054us 819.137us 18.31% 880.065us 73.339us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 819.137us 18.31% 819.137us 68.261us 12 - Activity Buffer Request 23.24% 1.427ms 23.24% 1.427ms 1.427ms 60.928us 1.36% 60.928us 60.928us 1 - aten::transpose 0.86% 52.980us 1.16% 71.060us 2.961us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.29% 18.080us 0.29% 18.080us 0.753us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.34% 20.930us 1.37% 83.913us 5.594us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.25% 77.043us 1.25% 77.043us 3.210us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 4.54% 278.990us 4.54% 278.990us 18.599us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.24% 14.661us 0.24% 14.661us 4.887us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.03% 1.978us 0.03% 1.978us 0.330us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.06% 3.901us 0.06% 3.901us 1.300us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 61.98% 3.806ms 61.98% 3.806ms 3.806ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 3.49% 226.744us 41.30% 2.682ms 2.682ms 0.000us 0.00% 4.507ms 4.507ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.456ms 100.23% 4.456ms 4.456ms 1 + aten::scaled_dot_product_attention 0.39% 25.000us 2.68% 173.753us 57.918us 0.000us 0.00% 3.635ms 1.212ms 3 + aten::_scaled_dot_product_flash_attention 0.28% 18.340us 2.29% 148.753us 49.584us 0.000us 0.00% 3.635ms 1.212ms 3 + aten::_flash_attention_forward 0.53% 34.164us 1.68% 109.263us 36.421us 3.635ms 81.77% 3.635ms 1.212ms 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.635ms 81.77% 3.635ms 1.212ms 3 + aten::contiguous 0.14% 8.821us 34.49% 2.240ms 186.626us 0.000us 0.00% 871.422us 72.619us 12 + aten::clone 0.41% 26.612us 34.36% 2.231ms 185.890us 0.000us 0.00% 871.422us 72.619us 12 + aten::copy_ 1.18% 76.909us 32.95% 2.140ms 178.308us 810.270us 18.23% 871.422us 72.619us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 810.270us 18.23% 810.270us 67.523us 12 + Activity Buffer Request 27.48% 1.784ms 27.48% 1.784ms 1.784ms 61.152us 1.38% 61.152us 61.152us 1 + aten::transpose 0.71% 45.940us 0.97% 63.019us 2.626us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.26% 17.079us 0.26% 17.079us 0.712us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.30% 19.781us 1.27% 82.742us 5.516us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.21% 78.423us 1.21% 78.423us 3.268us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 4.62% 300.294us 4.62% 300.294us 20.020us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.21% 13.430us 0.21% 13.430us 4.477us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.02% 1.610us 0.02% 1.610us 0.268us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.07% 4.648us 0.07% 4.648us 1.549us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.70% 3.811ms 58.70% 3.811ms 3.811ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.141ms -Self CUDA time total: 4.474ms +Self CPU time total: 6.493ms +Self CUDA time total: 4.445ms impl wl p50(ms) ok -torch_flash_ma cuda_attn_L128_bfloat16 1.22 True +torch_flash_ma cuda_attn_L128_bfloat16 1.23 True torch_flash_ma cuda_attn_L256_bfloat16 1.28 True torch_flash_ma cuda_attn_L320_bfloat16 1.30 True torch_flash_ma cuda_attn_L384_bfloat16 1.33 True -torch_flash_ma cuda_attn_L448_bfloat16 1.50 True -torch_flash_ma cuda_attn_L512_bfloat16 1.51 True +torch_flash_ma cuda_attn_L448_bfloat16 1.48 True +torch_flash_ma cuda_attn_L512_bfloat16 1.52 TrueArtifacts: