diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Fri Oct 31 20:00:00 2025 +Mon Nov 10 21:57:39 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 32C P0 101W / 350W | 0MiB / 46068MiB | 100% Default | +| N/A 26C P0 88W / 350W | 0MiB / 46068MiB | 22% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -4156,7 +3938,7 @@ Cell: nv | 0.23s ▼ output ▶ uv-logs | -Cell: benchmark | 4.67s +Cell: benchmark | 4.74s | Raw @@ -4227,23 +4009,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 426.303us 1837.51% 426.303us 426.303us 1 - hf_kernels_rotary 12.40% 260.056us 99.66% 2.090ms 2.090ms 0.000us 0.00% 24.480us 24.480us 1 - _rotary_dba7d1e::apply_rotary 2.75% 57.674us 5.07% 106.315us 17.719us 16.128us 69.52% 16.128us 2.688us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.128us 69.52% 16.128us 2.688us 6 - aten::clone 2.13% 44.582us 79.34% 1.664ms 277.309us 0.000us 0.00% 8.352us 1.392us 6 - aten::copy_ 1.84% 38.562us 74.44% 1.561ms 260.165us 7.072us 30.48% 8.352us 1.392us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.072us 30.48% 7.072us 1.179us 6 - Activity Buffer Request 69.01% 1.447ms 69.01% 1.447ms 1.447ms 1.280us 5.52% 1.280us 1.280us 1 - aten::empty_strided 2.78% 58.281us 2.78% 58.281us 9.713us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.58% 75.121us 3.58% 75.121us 12.520us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.14% 44.780us 2.85% 59.790us 4.983us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.72% 15.010us 0.72% 15.010us 1.251us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.32% 48.641us 2.32% 48.641us 8.107us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.34% 7.100us 0.34% 7.100us 7.100us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 403.678us 1730.44% 403.678us 403.678us 1 + hf_kernels_rotary 9.63% 231.023us 99.37% 2.384ms 2.384ms 0.000us 0.00% 24.608us 24.608us 1 + _rotary_dba7d1e::apply_rotary 2.18% 52.340us 4.07% 97.602us 16.267us 16.224us 69.55% 16.224us 2.704us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.224us 69.55% 16.224us 2.704us 6 + aten::clone 1.53% 36.662us 83.59% 2.005ms 334.171us 0.000us 0.00% 8.384us 1.397us 6 + aten::copy_ 1.80% 43.260us 79.70% 1.912ms 318.600us 7.104us 30.45% 8.384us 1.397us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.104us 30.45% 7.104us 1.184us 6 + Activity Buffer Request 74.82% 1.795ms 74.82% 1.795ms 1.795ms 1.280us 5.49% 1.280us 1.280us 1 + aten::empty_strided 2.37% 56.761us 2.37% 56.761us 9.460us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 3.07% 73.591us 3.07% 73.591us 12.265us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.65% 39.481us 2.08% 49.901us 4.158us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.43% 10.420us 0.43% 10.420us 0.868us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.89% 45.262us 1.89% 45.262us 7.544us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.63% 15.070us 0.63% 15.070us 15.070us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.097ms -Self CUDA time total: 23.200us +Self CPU time total: 2.399ms +Self CUDA time total: 23.328us @@ -4253,23 +4035,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 340.796us 1422.00% 340.796us 340.796us 1 - hf_kernels_rotary 9.48% 182.026us 99.73% 1.916ms 1.916ms 0.000us 0.00% 25.278us 25.278us 1 - _rotary_dba7d1e::apply_rotary 2.22% 42.701us 4.40% 84.531us 14.088us 16.159us 67.42% 16.159us 2.693us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.159us 67.42% 16.159us 2.693us 6 - aten::clone 1.41% 27.120us 83.58% 1.605ms 267.570us 0.000us 0.00% 9.119us 1.520us 6 - aten::copy_ 2.02% 38.773us 80.45% 1.545ms 257.555us 7.807us 32.58% 9.119us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 32.58% 7.807us 1.301us 6 - Activity Buffer Request 75.56% 1.451ms 75.56% 1.451ms 1.451ms 1.312us 5.47% 1.312us 1.312us 1 - aten::empty_strided 1.72% 32.970us 1.72% 32.970us 5.495us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.88% 55.291us 2.88% 55.291us 9.215us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.76% 33.749us 2.27% 43.642us 3.637us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.52% 9.893us 0.52% 9.893us 0.824us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.18% 41.830us 2.18% 41.830us 6.972us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.27% 5.161us 0.27% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.494us 1388.06% 334.494us 334.494us 1 + hf_kernels_rotary 8.19% 181.152us 99.73% 2.206ms 2.206ms 0.000us 0.00% 25.410us 25.410us 1 + _rotary_dba7d1e::apply_rotary 1.81% 39.991us 3.60% 79.751us 13.292us 16.193us 67.20% 16.193us 2.699us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.193us 67.20% 16.193us 2.699us 6 + aten::clone 1.33% 29.430us 86.17% 1.906ms 317.722us 0.000us 0.00% 9.217us 1.536us 6 + aten::copy_ 1.70% 37.720us 83.32% 1.843ms 307.237us 7.905us 32.80% 9.217us 1.536us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 32.80% 7.905us 1.317us 6 + Activity Buffer Request 79.13% 1.751ms 79.13% 1.751ms 1.751ms 1.312us 5.44% 1.312us 1.312us 1 + aten::empty_strided 1.51% 33.481us 1.51% 33.481us 5.580us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.49% 55.161us 2.49% 55.161us 9.194us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.38% 30.530us 1.77% 39.222us 3.268us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.39% 8.692us 0.39% 8.692us 0.724us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.80% 39.760us 1.80% 39.760us 6.627us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.870us 0.27% 5.870us 5.870us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.921ms -Self CUDA time total: 23.966us +Self CPU time total: 2.212ms +Self CUDA time total: 24.098us @@ -4279,23 +4061,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.421us 1391.81% 339.421us 339.421us 1 - hf_kernels_rotary 9.18% 172.926us 99.76% 1.879ms 1.879ms 0.000us 0.00% 25.699us 25.699us 1 - _rotary_dba7d1e::apply_rotary 2.20% 41.409us 4.51% 85.000us 14.167us 16.481us 67.58% 16.481us 2.747us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.481us 67.58% 16.481us 2.747us 6 - aten::clone 1.46% 27.581us 83.73% 1.577ms 262.862us 0.000us 0.00% 9.218us 1.536us 6 - aten::copy_ 1.97% 37.091us 80.45% 1.515ms 252.563us 7.906us 32.42% 9.218us 1.536us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.906us 32.42% 7.906us 1.318us 6 - Activity Buffer Request 75.71% 1.426ms 75.71% 1.426ms 1.426ms 1.312us 5.38% 1.312us 1.312us 1 - aten::empty_strided 1.82% 34.210us 1.82% 34.210us 5.702us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.77% 52.231us 2.77% 52.231us 8.705us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.80% 33.892us 2.33% 43.952us 3.663us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.53% 10.060us 0.53% 10.060us 0.838us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.31% 43.591us 2.31% 43.591us 7.265us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.550us 0.24% 4.550us 4.550us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 333.020us 1374.81% 333.020us 333.020us 1 + hf_kernels_rotary 8.22% 183.662us 99.77% 2.229ms 2.229ms 0.000us 0.00% 25.535us 25.535us 1 + _rotary_dba7d1e::apply_rotary 1.78% 39.771us 3.54% 79.142us 13.190us 16.479us 68.03% 16.479us 2.747us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.479us 68.03% 16.479us 2.747us 6 + aten::clone 1.23% 27.502us 86.14% 1.925ms 320.808us 0.000us 0.00% 9.056us 1.509us 6 + aten::copy_ 1.51% 33.780us 83.43% 1.864ms 310.723us 7.744us 31.97% 9.056us 1.509us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 31.97% 7.744us 1.291us 6 + Activity Buffer Request 79.60% 1.779ms 79.60% 1.779ms 1.779ms 1.312us 5.42% 1.312us 1.312us 1 + aten::empty_strided 1.48% 33.009us 1.48% 33.009us 5.501us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.32% 51.921us 2.32% 51.921us 8.654us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.44% 32.260us 1.87% 41.742us 3.478us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.42% 9.482us 0.42% 9.482us 0.790us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.76% 39.371us 1.76% 39.371us 6.562us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 5.150us 0.23% 5.150us 5.150us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.884ms -Self CUDA time total: 24.387us +Self CPU time total: 2.235ms +Self CUDA time total: 24.223us @@ -4305,23 +4087,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.466us 1252.36% 353.466us 353.466us 1 - hf_kernels_rotary 8.35% 176.747us 99.76% 2.111ms 2.111ms 0.000us 0.00% 30.048us 30.048us 1 - _rotary_dba7d1e::apply_rotary 2.17% 45.850us 4.21% 89.000us 14.833us 17.664us 62.59% 17.664us 2.944us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.664us 62.59% 17.664us 2.944us 6 - aten::clone 1.36% 28.714us 85.13% 1.802ms 300.274us 0.000us 0.00% 12.384us 2.064us 6 - aten::copy_ 1.83% 38.751us 82.20% 1.740ms 289.944us 10.560us 37.41% 12.384us 2.064us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 37.41% 10.560us 1.760us 6 - Activity Buffer Request 67.60% 1.431ms 67.60% 1.431ms 1.431ms 1.824us 6.46% 1.824us 1.824us 1 - aten::empty_strided 1.57% 33.269us 1.57% 33.269us 5.545us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 12.77% 270.306us 12.77% 270.306us 45.051us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.59% 33.568us 2.07% 43.911us 3.659us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.49% 10.343us 0.49% 10.343us 0.862us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.04% 43.150us 2.04% 43.150us 7.192us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 5.130us 0.24% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.396us 1170.66% 330.396us 330.396us 1 + hf_kernels_rotary 19.88% 180.354us 99.43% 901.975us 901.975us 0.000us 0.00% 29.983us 29.983us 1 + _rotary_dba7d1e::apply_rotary 4.33% 39.273us 8.60% 78.013us 13.002us 17.759us 62.92% 17.759us 2.960us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.759us 62.92% 17.759us 2.960us 6 + aten::clone 2.43% 22.040us 66.64% 604.579us 100.763us 0.000us 0.00% 12.224us 2.037us 6 + aten::copy_ 3.81% 34.600us 60.79% 551.459us 91.910us 10.464us 37.08% 12.224us 2.037us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 37.08% 10.464us 1.744us 6 + Activity Buffer Request 27.63% 250.684us 27.63% 250.684us 250.684us 1.760us 6.24% 1.760us 1.760us 1 + aten::empty_strided 3.43% 31.080us 3.43% 31.080us 5.180us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 29.34% 266.175us 29.34% 266.175us 44.362us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.36% 30.489us 4.30% 39.029us 3.252us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.94% 8.540us 0.94% 8.540us 0.712us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.27% 38.740us 4.27% 38.740us 6.457us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.57% 5.209us 0.57% 5.209us 5.209us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.116ms -Self CUDA time total: 28.224us +Self CPU time total: 907.184us +Self CUDA time total: 28.223us @@ -4331,23 +4113,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 351.740us 1444.46% 351.740us 351.740us 1 - hf_kernels_rotary 8.68% 176.155us 99.77% 2.024ms 2.024ms 0.000us 0.00% 25.663us 25.663us 1 - _rotary_dba7d1e::apply_rotary 2.27% 46.099us 4.32% 87.680us 14.613us 16.479us 67.67% 16.479us 2.747us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.479us 67.67% 16.479us 2.747us 6 - aten::clone 1.42% 28.832us 84.62% 1.717ms 286.091us 0.000us 0.00% 9.184us 1.531us 6 - aten::copy_ 1.86% 37.831us 81.49% 1.653ms 275.519us 7.872us 32.33% 9.184us 1.531us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 32.33% 7.872us 1.312us 6 - Activity Buffer Request 70.03% 1.420ms 70.03% 1.420ms 1.420ms 1.312us 5.39% 1.312us 1.312us 1 - aten::empty_strided 1.71% 34.601us 1.71% 34.601us 5.767us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.60% 194.784us 9.60% 194.784us 32.464us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.63% 33.102us 2.14% 43.512us 3.626us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.51% 10.410us 0.51% 10.410us 0.867us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.05% 41.581us 2.05% 41.581us 6.930us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.660us 0.23% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 333.950us 1371.35% 333.950us 333.950us 1 + hf_kernels_rotary 7.53% 182.915us 99.79% 2.425ms 2.425ms 0.000us 0.00% 25.664us 25.664us 1 + _rotary_dba7d1e::apply_rotary 1.65% 40.000us 3.26% 79.130us 13.188us 16.545us 67.94% 16.545us 2.758us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.545us 67.94% 16.545us 2.758us 6 + aten::clone 1.26% 30.642us 87.34% 2.122ms 353.721us 0.000us 0.00% 9.119us 1.520us 6 + aten::copy_ 1.47% 35.799us 84.75% 2.059ms 343.229us 7.807us 32.06% 9.119us 1.520us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 32.06% 7.807us 1.301us 6 + Activity Buffer Request 73.06% 1.775ms 73.06% 1.775ms 1.775ms 1.312us 5.39% 1.312us 1.312us 1 + aten::empty_strided 1.33% 32.310us 1.33% 32.310us 5.385us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.22% 248.434us 10.22% 248.434us 41.406us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.31% 31.720us 1.66% 40.370us 3.364us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 8.650us 0.36% 8.650us 0.721us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.61% 39.130us 1.61% 39.130us 6.522us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.21% 5.100us 0.21% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.029ms -Self CUDA time total: 24.351us +Self CPU time total: 2.430ms +Self CUDA time total: 24.352us @@ -4357,23 +4139,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.111us 1238.38% 349.111us 349.111us 1 - hf_kernels_rotary 23.24% 192.013us 99.32% 820.571us 820.571us 0.000us 0.00% 30.015us 30.015us 1 - _rotary_dba7d1e::apply_rotary 5.42% 44.795us 10.63% 87.866us 14.644us 17.632us 62.54% 17.632us 2.939us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.632us 62.54% 17.632us 2.939us 6 - aten::clone 2.69% 22.223us 60.09% 496.442us 82.740us 0.000us 0.00% 12.383us 2.064us 6 - aten::copy_ 4.60% 38.000us 53.48% 441.890us 73.648us 10.559us 37.46% 12.383us 2.064us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.559us 37.46% 10.559us 1.760us 6 - Activity Buffer Request 26.48% 218.816us 26.48% 218.816us 218.816us 1.824us 6.47% 1.824us 1.824us 1 - aten::empty_strided 3.91% 32.329us 3.91% 32.329us 5.388us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.40% 185.074us 22.40% 185.074us 30.846us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.04% 33.410us 5.36% 44.250us 3.688us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.31% 10.840us 1.31% 10.840us 0.903us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.21% 43.071us 5.21% 43.071us 7.178us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.68% 5.641us 0.68% 5.641us 5.641us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.717us 1169.19% 330.717us 330.717us 1 + hf_kernels_rotary 7.60% 182.573us 99.80% 2.396ms 2.396ms 0.000us 0.00% 30.046us 30.046us 1 + _rotary_dba7d1e::apply_rotary 1.66% 39.960us 3.28% 78.811us 13.135us 17.758us 62.78% 17.758us 2.960us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.758us 62.78% 17.758us 2.960us 6 + aten::clone 1.18% 28.252us 87.25% 2.095ms 349.108us 0.000us 0.00% 12.288us 2.048us 6 + aten::copy_ 1.56% 37.480us 84.78% 2.035ms 339.209us 10.528us 37.22% 12.288us 2.048us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 37.22% 10.528us 1.755us 6 + Activity Buffer Request 73.02% 1.753ms 73.02% 1.753ms 1.753ms 1.760us 6.22% 1.760us 1.760us 1 + aten::empty_strided 1.30% 31.140us 1.30% 31.140us 5.190us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.19% 244.675us 10.19% 244.675us 40.779us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.30% 31.158us 1.66% 39.899us 3.325us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 8.741us 0.36% 8.741us 0.728us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.62% 38.851us 1.62% 38.851us 6.475us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.20% 4.770us 0.20% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 826.212us -Self CUDA time total: 28.191us +Self CPU time total: 2.401ms +Self CUDA time total: 28.286us @@ -4383,23 +4165,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.984us 852.93% 344.984us 344.984us 1 - hf_kernels_rotary 22.02% 168.975us 99.39% 762.759us 762.759us 0.000us 0.00% 43.263us 43.263us 1 - _rotary_dba7d1e::apply_rotary 5.75% 44.162us 11.18% 85.802us 14.300us 23.456us 57.99% 23.456us 3.909us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.456us 57.99% 23.456us 3.909us 6 - aten::clone 2.91% 22.350us 60.45% 463.932us 77.322us 0.000us 0.00% 19.807us 3.301us 6 - aten::copy_ 4.98% 38.249us 53.45% 410.170us 68.362us 16.991us 42.01% 19.807us 3.301us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.991us 42.01% 16.991us 2.832us 6 - Activity Buffer Request 24.55% 188.395us 24.55% 188.395us 188.395us 2.816us 6.96% 2.816us 2.816us 1 - aten::empty_strided 4.09% 31.412us 4.09% 31.412us 5.235us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 23.91% 183.526us 23.91% 183.526us 30.588us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.40% 33.790us 5.74% 44.050us 3.671us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.34% 10.260us 1.34% 10.260us 0.855us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.43% 41.640us 5.43% 41.640us 6.940us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.61% 4.661us 0.61% 4.661us 4.661us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 331.263us 811.96% 331.263us 331.263us 1 + hf_kernels_rotary 7.62% 179.163us 99.79% 2.346ms 2.346ms 0.000us 0.00% 43.646us 43.646us 1 + _rotary_dba7d1e::apply_rotary 1.67% 39.309us 3.29% 77.411us 12.902us 23.680us 58.04% 23.680us 3.947us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.680us 58.04% 23.680us 3.947us 6 + aten::clone 1.17% 27.469us 87.14% 2.049ms 341.486us 0.000us 0.00% 19.966us 3.328us 6 + aten::copy_ 1.49% 35.141us 84.62% 1.990ms 331.589us 17.118us 41.96% 19.966us 3.328us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.118us 41.96% 17.118us 2.853us 6 + Activity Buffer Request 73.01% 1.717ms 73.01% 1.717ms 1.717ms 2.848us 6.98% 2.848us 2.848us 1 + aten::empty_strided 1.36% 31.912us 1.36% 31.912us 5.319us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.11% 237.764us 10.11% 237.764us 39.627us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.35% 31.810us 1.74% 40.800us 3.400us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.38% 8.990us 0.38% 8.990us 0.749us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.62% 38.102us 1.62% 38.102us 6.350us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.21% 4.871us 0.21% 4.871us 4.871us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 767.420us -Self CUDA time total: 40.447us +Self CPU time total: 2.351ms +Self CUDA time total: 40.798us @@ -4409,23 +4191,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.453us 442.64% 347.453us 347.453us 1 - hf_kernels_rotary 20.37% 160.826us 99.39% 784.751us 784.751us 0.000us 0.00% 91.040us 91.040us 1 - aten::clone 2.83% 22.340us 62.44% 492.983us 82.164us 0.000us 0.00% 52.865us 8.811us 6 - aten::copy_ 4.65% 36.740us 55.30% 436.663us 72.777us 40.321us 51.37% 52.865us 8.811us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 40.321us 51.37% 40.321us 6.720us 6 - _rotary_dba7d1e::apply_rotary 5.74% 45.350us 11.00% 86.891us 14.482us 38.175us 48.63% 38.175us 6.362us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 38.175us 48.63% 38.175us 6.362us 6 - Activity Buffer Request 27.86% 219.946us 27.86% 219.946us 219.946us 12.544us 15.98% 12.544us 12.544us 1 - aten::empty_strided 4.30% 33.980us 4.30% 33.980us 5.663us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.79% 179.977us 22.79% 179.977us 29.996us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.35% 34.361us 5.58% 44.051us 3.671us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.23% 9.690us 1.23% 9.690us 0.808us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.26% 41.541us 5.26% 41.541us 6.924us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.61% 4.830us 0.61% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 336.387us 451.94% 336.387us 336.387us 1 + hf_kernels_rotary 7.84% 184.420us 99.78% 2.346ms 2.346ms 0.000us 0.00% 82.976us 82.976us 1 + aten::clone 1.21% 28.560us 86.97% 2.045ms 340.779us 0.000us 0.00% 43.553us 7.259us 6 + aten::copy_ 1.54% 36.092us 84.34% 1.983ms 330.495us 35.009us 47.03% 43.553us 7.259us 6 + _rotary_dba7d1e::apply_rotary 1.67% 39.331us 3.28% 77.091us 12.849us 39.423us 52.97% 39.423us 6.571us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.423us 52.97% 39.423us 6.571us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 35.009us 47.03% 35.009us 5.835us 6 + Activity Buffer Request 73.02% 1.717ms 73.02% 1.717ms 1.717ms 8.544us 11.48% 8.544us 8.544us 1 + aten::empty_strided 1.41% 33.141us 1.41% 33.141us 5.523us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.79% 230.064us 9.79% 230.064us 38.344us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.34% 31.492us 1.69% 39.832us 3.319us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.35% 8.340us 0.35% 8.340us 0.695us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.61% 37.760us 1.61% 37.760us 6.293us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.070us 0.22% 5.070us 5.070us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 789.581us -Self CUDA time total: 78.496us +Self CPU time total: 2.351ms +Self CUDA time total: 74.432us @@ -4435,23 +4217,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.324us 858.06% 347.324us 347.324us 1 - hf_kernels_rotary 8.65% 173.958us 99.77% 2.007ms 2.007ms 0.000us 0.00% 43.325us 43.325us 1 - _rotary_dba7d1e::apply_rotary 2.18% 43.910us 4.21% 84.770us 14.128us 23.423us 57.87% 23.423us 3.904us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.423us 57.87% 23.423us 3.904us 6 - aten::clone 1.35% 27.211us 84.83% 1.706ms 284.405us 0.000us 0.00% 19.902us 3.317us 6 - aten::copy_ 1.92% 38.681us 81.76% 1.645ms 274.138us 17.055us 42.13% 19.902us 3.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.055us 42.13% 17.055us 2.842us 6 - Activity Buffer Request 70.68% 1.422ms 70.68% 1.422ms 1.422ms 2.847us 7.03% 2.847us 2.847us 1 - aten::empty_strided 1.71% 34.392us 1.71% 34.392us 5.732us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.16% 184.363us 9.16% 184.363us 30.727us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.62% 32.593us 2.08% 41.861us 3.488us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.46% 9.268us 0.46% 9.268us 0.772us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.03% 40.860us 2.03% 40.860us 6.810us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.670us 0.23% 4.670us 4.670us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.720us 824.27% 334.720us 334.720us 1 + hf_kernels_rotary 7.69% 178.052us 99.76% 2.310ms 2.310ms 0.000us 0.00% 43.488us 43.488us 1 + _rotary_dba7d1e::apply_rotary 1.77% 40.921us 3.42% 79.272us 13.212us 23.680us 58.31% 23.680us 3.947us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.680us 58.31% 23.680us 3.947us 6 + aten::clone 1.23% 28.463us 86.92% 2.013ms 335.521us 0.000us 0.00% 19.808us 3.301us 6 + aten::copy_ 1.52% 35.247us 84.34% 1.953ms 325.533us 16.928us 41.69% 19.808us 3.301us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 41.69% 16.928us 2.821us 6 + Activity Buffer Request 73.01% 1.691ms 73.01% 1.691ms 1.691ms 2.880us 7.09% 2.880us 2.880us 1 + aten::empty_strided 1.36% 31.460us 1.36% 31.460us 5.243us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.81% 227.126us 9.81% 227.126us 37.854us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.37% 31.801us 1.73% 40.020us 3.335us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.35% 8.219us 0.35% 8.219us 0.685us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.66% 38.351us 1.66% 38.351us 6.392us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 5.500us 0.24% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.012ms -Self CUDA time total: 40.478us +Self CPU time total: 2.316ms +Self CUDA time total: 40.608us @@ -4461,23 +4243,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 361.785us 476.45% 361.785us 361.785us 1 - hf_kernels_rotary 8.64% 176.662us 99.77% 2.040ms 2.040ms 0.000us 0.00% 86.685us 86.685us 1 - aten::clone 1.40% 28.682us 84.64% 1.731ms 288.486us 0.000us 0.00% 47.871us 7.979us 6 - aten::copy_ 1.80% 36.737us 81.55% 1.668ms 277.962us 37.119us 48.88% 47.871us 7.979us 6 - _rotary_dba7d1e::apply_rotary 2.24% 45.910us 4.34% 88.820us 14.803us 38.814us 51.12% 38.814us 6.469us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 38.814us 51.12% 38.814us 6.469us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.119us 48.88% 37.119us 6.187us 6 - Activity Buffer Request 70.82% 1.448ms 70.82% 1.448ms 1.448ms 10.752us 14.16% 10.752us 10.752us 1 - aten::empty_strided 1.69% 34.462us 1.69% 34.462us 5.744us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.93% 182.677us 8.93% 182.677us 30.446us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.66% 33.994us 2.15% 43.925us 3.660us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.49% 9.931us 0.49% 9.931us 0.828us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.10% 42.910us 2.10% 42.910us 7.152us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.670us 0.23% 4.670us 4.670us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 343.357us 451.99% 343.357us 343.357us 1 + hf_kernels_rotary 7.23% 182.803us 99.81% 2.522ms 2.522ms 0.000us 0.00% 85.341us 85.341us 1 + aten::clone 1.16% 29.441us 87.88% 2.221ms 370.131us 0.000us 0.00% 46.013us 7.669us 6 + aten::copy_ 1.42% 35.932us 85.39% 2.158ms 359.654us 36.637us 48.23% 46.013us 7.669us 6 + _rotary_dba7d1e::apply_rotary 1.58% 39.950us 3.09% 78.111us 13.018us 39.328us 51.77% 39.328us 6.555us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.328us 51.77% 39.328us 6.555us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 36.637us 48.23% 36.637us 6.106us 6 + Activity Buffer Request 75.16% 1.899ms 75.16% 1.899ms 1.899ms 9.376us 12.34% 9.376us 9.376us 1 + aten::empty_strided 1.32% 33.420us 1.32% 33.420us 5.570us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.81% 222.633us 8.81% 222.633us 37.105us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.25% 31.613us 1.61% 40.701us 3.392us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 9.088us 0.36% 9.088us 0.757us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.51% 38.161us 1.51% 38.161us 6.360us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.19% 4.790us 0.19% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.045ms -Self CUDA time total: 75.933us +Self CPU time total: 2.527ms +Self CUDA time total: 75.965us @@ -4487,23 +4269,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 373.629us 268.97% 373.629us 373.629us 1 - hf_kernels_rotary 8.95% 179.578us 99.78% 2.002ms 2.002ms 0.000us 0.00% 162.750us 162.750us 1 - aten::clone 1.48% 29.597us 83.94% 1.684ms 280.680us 0.000us 0.00% 102.944us 17.157us 6 - aten::copy_ 1.82% 36.553us 80.73% 1.620ms 269.962us 79.104us 56.95% 102.944us 17.157us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 79.104us 56.95% 79.104us 13.184us 6 - _rotary_dba7d1e::apply_rotary 2.30% 46.131us 4.57% 91.713us 15.285us 59.806us 43.05% 59.806us 9.968us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 59.806us 43.05% 59.806us 9.968us 6 - Activity Buffer Request 69.91% 1.403ms 69.91% 1.403ms 1.403ms 23.840us 17.16% 23.840us 23.840us 1 - aten::empty_strided 1.73% 34.712us 1.73% 34.712us 5.785us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.00% 180.563us 9.00% 180.563us 30.094us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.75% 35.198us 2.31% 46.409us 3.867us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.56% 11.211us 0.56% 11.211us 0.934us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.27% 45.582us 2.27% 45.582us 7.597us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 4.510us 0.22% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.487us 241.29% 335.487us 335.487us 1 + hf_kernels_rotary 7.48% 174.562us 99.79% 2.329ms 2.329ms 0.000us 0.00% 162.718us 162.718us 1 + aten::clone 1.24% 29.010us 87.24% 2.036ms 339.299us 0.000us 0.00% 102.494us 17.082us 6 + aten::copy_ 1.51% 35.312us 84.60% 1.974ms 329.037us 78.815us 56.69% 102.494us 17.082us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.815us 56.69% 78.815us 13.136us 6 + _rotary_dba7d1e::apply_rotary 1.71% 39.800us 3.37% 78.741us 13.124us 60.224us 43.31% 60.224us 10.037us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 60.224us 43.31% 60.224us 10.037us 6 + Activity Buffer Request 73.92% 1.725ms 73.92% 1.725ms 1.725ms 23.679us 17.03% 23.679us 23.679us 1 + aten::empty_strided 1.40% 32.561us 1.40% 32.561us 5.427us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.17% 213.963us 9.17% 213.963us 35.660us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.33% 31.050us 1.69% 39.471us 3.289us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 8.421us 0.36% 8.421us 0.702us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.67% 38.941us 1.67% 38.941us 6.490us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.21% 4.971us 0.21% 4.971us 4.971us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.006ms -Self CUDA time total: 138.910us +Self CPU time total: 2.334ms +Self CUDA time total: 139.039us @@ -4513,23 +4295,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 7.56% 177.196us 86.68% 2.032ms 2.032ms 0.000us 0.00% 778.402us 778.402us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 717.248us 101.07% 717.248us 717.248us 1 - aten::clone 1.23% 28.772us 72.98% 1.711ms 285.141us 0.000us 0.00% 578.626us 96.438us 6 - aten::copy_ 1.64% 38.341us 70.23% 1.646ms 274.415us 509.889us 71.85% 578.626us 96.438us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 509.889us 71.85% 509.889us 84.982us 6 - _rotary_dba7d1e::apply_rotary 2.34% 54.801us 4.25% 99.591us 16.598us 199.776us 28.15% 199.776us 33.296us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 199.776us 28.15% 199.776us 33.296us 6 - Activity Buffer Request 60.86% 1.427ms 60.86% 1.427ms 1.427ms 68.737us 9.69% 68.737us 68.737us 1 - aten::empty_strided 1.52% 35.581us 1.52% 35.581us 5.930us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.74% 181.435us 7.74% 181.435us 30.239us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.41% 33.151us 1.89% 44.330us 3.694us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 11.179us 0.48% 11.179us 0.932us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.91% 44.790us 1.91% 44.790us 7.465us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 13.32% 312.348us 13.32% 312.348us 312.348us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 13.11% 152.482us 70.07% 814.833us 814.833us 0.000us 0.00% 767.862us 767.862us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 709.398us 101.13% 709.398us 709.398us 1 + aten::clone 1.92% 22.371us 46.79% 544.150us 90.692us 0.000us 0.00% 567.671us 94.612us 6 + aten::copy_ 3.06% 35.584us 42.24% 491.229us 81.872us 501.304us 71.46% 567.671us 94.612us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 501.304us 71.46% 501.304us 83.551us 6 + _rotary_dba7d1e::apply_rotary 3.52% 40.960us 6.87% 79.901us 13.317us 200.191us 28.54% 200.191us 33.365us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 200.191us 28.54% 200.191us 33.365us 6 + Activity Buffer Request 20.99% 244.144us 20.99% 244.144us 244.144us 66.367us 9.46% 66.367us 66.367us 1 + aten::empty_strided 2.63% 30.550us 2.63% 30.550us 5.092us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.19% 211.501us 18.19% 211.501us 35.250us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.57% 29.881us 3.29% 38.300us 3.192us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.72% 8.419us 0.72% 8.419us 0.702us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.35% 38.941us 3.35% 38.941us 6.490us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 29.93% 348.096us 29.93% 348.096us 348.096us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.344ms -Self CUDA time total: 709.665us +Self CPU time total: 1.163ms +Self CUDA time total: 701.495us @@ -4539,23 +4321,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.629us 1313.11% 349.629us 349.629us 1 - hf_kernels_rotary 8.75% 174.875us 99.76% 1.994ms 1.994ms 0.000us 0.00% 27.938us 27.938us 1 - _rotary_dba7d1e::apply_rotary 2.16% 43.200us 4.40% 87.900us 14.650us 18.754us 70.43% 18.754us 3.126us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.754us 70.43% 18.754us 3.126us 6 - aten::clone 1.44% 28.720us 84.48% 1.688ms 281.365us 0.000us 0.00% 9.184us 1.531us 6 - aten::copy_ 1.82% 36.432us 81.36% 1.626ms 271.003us 7.872us 29.57% 9.184us 1.531us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 29.57% 7.872us 1.312us 6 - Activity Buffer Request 70.53% 1.410ms 70.53% 1.410ms 1.410ms 1.312us 4.93% 1.312us 1.312us 1 - aten::empty_strided 1.67% 33.452us 1.67% 33.452us 5.575us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.01% 180.083us 9.01% 180.083us 30.014us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.63% 32.560us 2.14% 42.684us 3.557us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.51% 10.124us 0.51% 10.124us 0.844us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.24% 44.700us 2.24% 44.700us 7.450us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.780us 0.24% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 326.016us 1225.99% 326.016us 326.016us 1 + hf_kernels_rotary 18.50% 152.323us 99.40% 818.663us 818.663us 0.000us 0.00% 27.904us 27.904us 1 + _rotary_dba7d1e::apply_rotary 4.86% 40.039us 9.57% 78.850us 13.142us 18.752us 70.52% 18.752us 3.125us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.752us 70.52% 18.752us 3.125us 6 + aten::clone 2.56% 21.061us 66.62% 548.640us 91.440us 0.000us 0.00% 9.152us 1.525us 6 + aten::copy_ 4.19% 34.519us 60.27% 496.387us 82.731us 7.840us 29.48% 9.152us 1.525us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 29.48% 7.840us 1.307us 6 + Activity Buffer Request 29.97% 246.784us 29.97% 246.784us 246.784us 1.312us 4.93% 1.312us 1.312us 1 + aten::empty_strided 3.79% 31.192us 3.79% 31.192us 5.199us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.12% 215.084us 26.12% 215.084us 35.847us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.71% 30.531us 4.72% 38.850us 3.237us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.01% 8.319us 1.01% 8.319us 0.693us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.71% 38.811us 4.71% 38.811us 6.469us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.60% 4.910us 0.60% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.998ms -Self CUDA time total: 26.626us +Self CPU time total: 823.573us +Self CUDA time total: 26.592us @@ -4565,23 +4347,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.698us 1282.22% 344.698us 344.698us 1 - hf_kernels_rotary 22.61% 152.757us 99.23% 670.538us 670.538us 0.000us 0.00% 28.195us 28.195us 1 - _rotary_dba7d1e::apply_rotary 6.64% 44.870us 12.97% 87.630us 14.605us 19.009us 70.71% 19.009us 3.168us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.009us 70.71% 19.009us 3.168us 6 - aten::clone 3.38% 22.839us 57.25% 386.869us 64.478us 0.000us 0.00% 9.186us 1.531us 6 - aten::copy_ 5.63% 38.041us 49.11% 331.829us 55.305us 7.874us 29.29% 9.186us 1.531us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.874us 29.29% 7.874us 1.312us 6 - Activity Buffer Request 16.48% 111.363us 16.48% 111.363us 111.363us 1.312us 4.88% 1.312us 1.312us 1 - aten::empty_strided 4.77% 32.201us 4.77% 32.201us 5.367us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 27.00% 182.425us 27.00% 182.425us 30.404us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.90% 33.085us 6.41% 43.282us 3.607us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.51% 10.197us 1.51% 10.197us 0.850us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 6.33% 42.760us 6.33% 42.760us 7.127us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.77% 5.200us 0.77% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.263us 1209.82% 323.263us 323.263us 1 + hf_kernels_rotary 17.52% 147.623us 99.42% 837.623us 837.623us 0.000us 0.00% 28.032us 28.032us 1 + _rotary_dba7d1e::apply_rotary 4.62% 38.930us 9.25% 77.941us 12.990us 18.944us 70.90% 18.944us 3.157us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.944us 70.90% 18.944us 3.157us 6 + aten::clone 2.83% 23.880us 68.02% 573.009us 95.502us 0.000us 0.00% 9.088us 1.515us 6 + aten::copy_ 4.05% 34.160us 61.53% 518.397us 86.400us 7.776us 29.10% 9.088us 1.515us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 29.10% 7.776us 1.296us 6 + Activity Buffer Request 32.41% 273.024us 32.41% 273.024us 273.024us 1.312us 4.91% 1.312us 1.312us 1 + aten::empty_strided 3.65% 30.732us 3.65% 30.732us 5.122us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.07% 211.213us 25.07% 211.213us 35.202us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.65% 30.720us 4.64% 39.050us 3.254us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.330us 0.99% 8.330us 0.694us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.63% 39.011us 4.63% 39.011us 6.502us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.58% 4.850us 0.58% 4.850us 4.850us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 675.738us -Self CUDA time total: 26.883us +Self CPU time total: 842.473us +Self CUDA time total: 26.720us @@ -4591,22 +4373,22 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.004us 1141.75% 350.004us 350.004us 1 - hf_kernels_rotary 19.05% 154.214us 99.36% 804.261us 804.261us 0.000us 0.00% 32.414us 32.414us 1 - _rotary_dba7d1e::apply_rotary 5.47% 44.240us 10.98% 88.910us 14.818us 20.064us 65.45% 20.064us 3.344us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.064us 65.45% 20.064us 3.344us 6 - aten::clone 3.02% 24.421us 63.80% 516.433us 86.072us 0.000us 0.00% 12.350us 2.058us 6 - aten::copy_ 4.66% 37.732us 56.69% 458.901us 76.483us 10.591us 34.55% 12.350us 2.058us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.591us 34.55% 10.591us 1.765us 6 - Activity Buffer Request 29.69% 240.306us 29.69% 240.306us 240.306us 1.759us 5.74% 1.759us 1.759us 1 - aten::empty_strided 4.09% 33.111us 4.09% 33.111us 5.518us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.34% 180.863us 22.34% 180.863us 30.144us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.15% 33.594us 5.52% 44.704us 3.725us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.37% 11.110us 1.37% 11.110us 0.926us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.52% 44.670us 5.52% 44.670us 7.445us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.64% 5.201us 0.64% 5.201us 5.201us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 317.947us 1037.18% 317.947us 317.947us 1 + hf_kernels_rotary 18.00% 147.321us 99.35% 812.963us 812.963us 0.000us 0.00% 32.383us 32.383us 1 + _rotary_dba7d1e::apply_rotary 4.88% 39.901us 9.44% 77.251us 12.875us 20.255us 66.07% 20.255us 3.376us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.255us 66.07% 20.255us 3.376us 6 + aten::clone 2.41% 19.693us 67.19% 549.781us 91.630us 0.000us 0.00% 12.128us 2.021us 6 + aten::copy_ 4.28% 35.023us 61.13% 500.160us 83.360us 10.400us 33.93% 12.128us 2.021us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 33.93% 10.400us 1.733us 6 + Activity Buffer Request 31.00% 253.664us 31.00% 253.664us 253.664us 1.728us 5.64% 1.728us 1.728us 1 + aten::empty_strided 3.66% 29.928us 3.66% 29.928us 4.988us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.84% 211.473us 25.84% 211.473us 35.245us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.72% 30.411us 4.72% 38.610us 3.218us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.00% 8.199us 1.00% 8.199us 0.683us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.56% 37.350us 4.56% 37.350us 6.225us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 5.289us 0.65% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 809.462us +Self CPU time total: 818.252us Self CUDA time total: 30.655us @@ -4617,23 +4399,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.355us 822.64% 350.355us 350.355us 1 - hf_kernels_rotary 19.55% 155.605us 99.35% 790.981us 790.981us 0.000us 0.00% 45.469us 45.469us 1 - _rotary_dba7d1e::apply_rotary 5.55% 44.191us 11.02% 87.731us 14.622us 25.565us 60.03% 25.565us 4.261us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.565us 60.03% 25.565us 4.261us 6 - aten::clone 2.81% 22.389us 63.13% 502.593us 83.766us 0.000us 0.00% 19.904us 3.317us 6 - aten::copy_ 4.90% 39.043us 56.13% 446.833us 74.472us 17.024us 39.97% 19.904us 3.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 39.97% 17.024us 2.837us 6 - Activity Buffer Request 28.37% 225.886us 28.37% 225.886us 225.886us 2.880us 6.76% 2.880us 2.880us 1 - aten::empty_strided 4.19% 33.371us 4.19% 33.371us 5.562us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.85% 181.904us 22.85% 181.904us 30.317us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.29% 34.142us 5.66% 45.052us 3.754us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.37% 10.910us 1.37% 10.910us 0.909us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.47% 43.540us 5.47% 43.540us 7.257us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.65% 5.140us 0.65% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 331.769us 777.76% 331.769us 331.769us 1 + hf_kernels_rotary 19.70% 168.549us 99.44% 850.864us 850.864us 0.000us 0.00% 45.537us 45.537us 1 + _rotary_dba7d1e::apply_rotary 4.73% 40.431us 9.19% 78.662us 13.110us 25.697us 60.24% 25.697us 4.283us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.697us 60.24% 25.697us 4.283us 6 + aten::clone 2.97% 25.433us 65.78% 562.881us 93.814us 0.000us 0.00% 19.840us 3.307us 6 + aten::copy_ 4.23% 36.170us 59.14% 506.068us 84.345us 16.960us 39.76% 19.840us 3.307us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 39.76% 16.960us 2.827us 6 + Activity Buffer Request 30.43% 260.334us 30.43% 260.334us 260.334us 2.880us 6.75% 2.880us 2.880us 1 + aten::empty_strided 3.67% 31.380us 3.67% 31.380us 5.230us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.49% 209.564us 24.49% 209.564us 34.927us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.75% 32.092us 4.77% 40.772us 3.398us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.01% 8.680us 1.01% 8.680us 0.723us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.47% 38.231us 4.47% 38.231us 6.372us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.56% 4.789us 0.56% 4.789us 4.789us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 796.121us -Self CUDA time total: 42.589us +Self CPU time total: 855.653us +Self CUDA time total: 42.657us @@ -4643,23 +4425,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.951us 1133.59% 344.951us 344.951us 1 - hf_kernels_rotary 19.05% 153.418us 99.42% 800.680us 800.680us 0.000us 0.00% 32.125us 32.125us 1 - _rotary_dba7d1e::apply_rotary 5.43% 43.718us 10.83% 87.180us 14.530us 20.095us 66.04% 20.095us 3.349us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.095us 66.04% 20.095us 3.349us 6 - aten::clone 2.75% 22.180us 64.20% 517.012us 86.169us 0.000us 0.00% 12.030us 2.005us 6 - aten::copy_ 4.82% 38.813us 57.22% 460.802us 76.800us 10.335us 33.96% 12.030us 2.005us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.335us 33.96% 10.335us 1.722us 6 - Activity Buffer Request 30.13% 242.666us 30.13% 242.666us 242.666us 1.695us 5.57% 1.695us 1.695us 1 - aten::empty_strided 4.23% 34.030us 4.23% 34.030us 5.672us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.27% 179.323us 22.27% 179.323us 29.887us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.11% 33.131us 5.35% 43.070us 3.589us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.23% 9.939us 1.23% 9.939us 0.828us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.40% 43.462us 5.40% 43.462us 7.244us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.660us 0.58% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 324.568us 1058.74% 324.568us 324.568us 1 + hf_kernels_rotary 19.85% 169.202us 99.36% 847.094us 847.094us 0.000us 0.00% 32.384us 32.384us 1 + _rotary_dba7d1e::apply_rotary 4.69% 39.959us 9.27% 78.991us 13.165us 20.352us 66.39% 20.352us 3.392us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.352us 66.39% 20.352us 3.392us 6 + aten::clone 2.92% 24.890us 65.73% 560.410us 93.402us 0.000us 0.00% 12.032us 2.005us 6 + aten::copy_ 4.20% 35.769us 59.19% 504.659us 84.110us 10.304us 33.61% 12.032us 2.005us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.304us 33.61% 10.304us 1.717us 6 + Activity Buffer Request 30.61% 260.975us 30.61% 260.975us 260.975us 1.728us 5.64% 1.728us 1.728us 1 + aten::empty_strided 3.62% 30.861us 3.62% 30.861us 5.143us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.39% 207.915us 24.39% 207.915us 34.652us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.54% 30.221us 4.51% 38.491us 3.208us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.97% 8.270us 0.97% 8.270us 0.689us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.58% 39.032us 4.58% 39.032us 6.505us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.460us 0.64% 5.460us 5.460us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 805.340us -Self CUDA time total: 30.430us +Self CPU time total: 852.554us +Self CUDA time total: 30.656us @@ -4669,23 +4451,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 358.905us 840.15% 358.905us 358.905us 1 - hf_kernels_rotary 15.26% 159.123us 99.55% 1.038ms 1.038ms 0.000us 0.00% 45.598us 45.598us 1 - _rotary_dba7d1e::apply_rotary 4.27% 44.490us 8.42% 87.790us 14.632us 25.600us 59.93% 25.600us 4.267us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.600us 59.93% 25.600us 4.267us 6 - aten::clone 2.23% 23.211us 71.54% 746.059us 124.343us 0.000us 0.00% 19.998us 3.333us 6 - aten::copy_ 3.70% 38.572us 65.96% 687.817us 114.636us 17.119us 40.07% 19.998us 3.333us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.119us 40.07% 17.119us 2.853us 6 - Activity Buffer Request 44.90% 468.242us 44.90% 468.242us 468.242us 2.879us 6.74% 2.879us 2.879us 1 - aten::empty_strided 3.36% 35.031us 3.36% 35.031us 5.838us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.36% 181.003us 17.36% 181.003us 30.167us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.32% 34.604us 4.33% 45.135us 3.761us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.01% 10.531us 1.01% 10.531us 0.878us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.15% 43.300us 4.15% 43.300us 7.217us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.45% 4.700us 0.45% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 328.702us 766.04% 328.702us 328.702us 1 + hf_kernels_rotary 18.09% 152.853us 99.33% 839.363us 839.363us 0.000us 0.00% 45.788us 45.788us 1 + _rotary_dba7d1e::apply_rotary 4.68% 39.541us 9.21% 77.782us 12.964us 25.887us 60.33% 25.887us 4.314us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.887us 60.33% 25.887us 4.314us 6 + aten::clone 2.66% 22.468us 67.35% 569.108us 94.851us 0.000us 0.00% 19.901us 3.317us 6 + aten::copy_ 4.16% 35.173us 60.88% 514.450us 85.742us 17.022us 39.67% 19.901us 3.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.022us 39.67% 17.022us 2.837us 6 + Activity Buffer Request 32.07% 270.965us 32.07% 270.965us 270.965us 2.879us 6.71% 2.879us 2.879us 1 + aten::empty_strided 3.81% 32.190us 3.81% 32.190us 5.365us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.65% 208.312us 24.65% 208.312us 34.719us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.71% 31.390us 4.69% 39.620us 3.302us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.97% 8.230us 0.97% 8.230us 0.686us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.53% 38.241us 4.53% 38.241us 6.374us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.67% 5.631us 0.67% 5.631us 5.631us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.043ms -Self CUDA time total: 42.719us +Self CPU time total: 844.994us +Self CUDA time total: 42.909us @@ -4695,23 +4477,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 383.638us 432.19% 383.638us 383.638us 1 - hf_kernels_rotary 19.20% 158.364us 99.38% 819.611us 819.611us 0.000us 0.00% 103.870us 103.870us 1 - aten::clone 2.74% 22.581us 61.51% 507.313us 84.552us 0.000us 0.00% 63.135us 10.522us 6 - aten::copy_ 4.83% 39.811us 54.76% 451.622us 75.270us 48.031us 54.11% 63.135us 10.522us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 48.031us 54.11% 48.031us 8.005us 6 - _rotary_dba7d1e::apply_rotary 5.49% 45.243us 13.16% 108.504us 18.084us 40.735us 45.89% 40.735us 6.789us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 40.735us 45.89% 40.735us 6.789us 6 - Activity Buffer Request 27.50% 226.825us 27.50% 226.825us 226.825us 15.104us 17.02% 15.104us 15.104us 1 - aten::empty_strided 4.01% 33.110us 4.01% 33.110us 5.518us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.43% 184.986us 22.43% 184.986us 30.831us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.25% 35.021us 5.51% 45.430us 3.786us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.26% 10.409us 1.26% 10.409us 0.867us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 7.67% 63.261us 7.67% 63.261us 10.543us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 5.141us 0.62% 5.141us 5.141us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 337.246us 364.66% 337.246us 337.246us 1 + hf_kernels_rotary 7.43% 178.431us 99.78% 2.398ms 2.398ms 0.000us 0.00% 107.425us 107.425us 1 + aten::clone 1.14% 27.439us 87.31% 2.098ms 349.642us 0.000us 0.00% 65.823us 10.970us 6 + aten::copy_ 1.39% 33.333us 84.85% 2.039ms 339.779us 50.880us 55.02% 65.823us 10.970us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 50.880us 55.02% 50.880us 8.480us 6 + _rotary_dba7d1e::apply_rotary 1.70% 40.740us 3.29% 79.070us 13.178us 41.602us 44.98% 41.602us 6.934us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.602us 44.98% 41.602us 6.934us 6 + Activity Buffer Request 74.72% 1.795ms 74.72% 1.795ms 1.795ms 14.943us 16.16% 14.943us 14.943us 1 + aten::empty_strided 1.32% 31.741us 1.32% 31.741us 5.290us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.74% 209.903us 8.74% 209.903us 34.984us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.35% 32.344us 1.76% 42.183us 3.515us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.41% 9.839us 0.41% 9.839us 0.820us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.60% 38.330us 1.60% 38.330us 6.388us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.280us 0.22% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 824.752us -Self CUDA time total: 88.766us +Self CPU time total: 2.403ms +Self CUDA time total: 92.482us @@ -4721,23 +4503,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 359.259us 247.18% 359.259us 359.259us 1 - hf_kernels_rotary 19.06% 158.337us 99.39% 825.781us 825.781us 0.000us 0.00% 168.829us 168.829us 1 - aten::clone 2.83% 23.549us 64.09% 532.493us 88.749us 0.000us 0.00% 105.470us 17.578us 6 - aten::copy_ 4.58% 38.013us 57.29% 475.972us 79.329us 81.982us 56.41% 105.470us 17.578us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.982us 56.41% 81.982us 13.664us 6 - _rotary_dba7d1e::apply_rotary 5.47% 45.451us 10.86% 90.251us 15.042us 63.359us 43.59% 63.359us 10.560us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.359us 43.59% 63.359us 10.560us 6 - Activity Buffer Request 31.29% 259.966us 31.29% 259.966us 259.966us 23.488us 16.16% 23.488us 23.488us 1 - aten::empty_strided 3.97% 32.972us 3.97% 32.972us 5.495us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.42% 177.993us 21.42% 177.993us 29.665us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.19% 34.839us 5.38% 44.700us 3.725us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.19% 9.861us 1.19% 9.861us 0.822us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.39% 44.800us 5.39% 44.800us 7.467us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.61% 5.100us 0.61% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 331.357us 227.98% 331.357us 331.357us 1 + hf_kernels_rotary 19.22% 153.403us 99.38% 793.253us 793.253us 0.000us 0.00% 169.054us 169.054us 1 + aten::clone 2.47% 19.681us 65.33% 521.479us 86.913us 0.000us 0.00% 105.151us 17.525us 6 + aten::copy_ 4.41% 35.219us 59.11% 471.788us 78.631us 81.439us 56.03% 105.151us 17.525us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.439us 56.03% 81.439us 13.573us 6 + _rotary_dba7d1e::apply_rotary 5.09% 40.640us 9.93% 79.270us 13.212us 63.903us 43.97% 63.903us 10.650us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.903us 43.97% 63.903us 10.650us 6 + Activity Buffer Request 29.11% 232.364us 29.11% 232.364us 232.364us 23.712us 16.31% 23.712us 23.712us 1 + aten::empty_strided 3.76% 30.010us 3.76% 30.010us 5.002us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.58% 204.205us 25.58% 204.205us 34.034us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.78% 30.171us 4.90% 39.101us 3.258us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.12% 8.930us 1.12% 8.930us 0.744us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.84% 38.630us 4.84% 38.630us 6.438us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 4.940us 0.62% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 830.881us -Self CUDA time total: 145.341us +Self CPU time total: 798.193us +Self CUDA time total: 145.342us @@ -4747,23 +4529,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 385.725us 509.05% 385.725us 385.725us 1 - hf_kernels_rotary 8.62% 176.456us 99.78% 2.043ms 2.043ms 0.000us 0.00% 82.558us 82.558us 1 - _rotary_dba7d1e::apply_rotary 2.32% 47.603us 4.41% 90.273us 15.045us 41.694us 55.02% 41.694us 6.949us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.694us 55.02% 41.694us 6.949us 6 - aten::clone 1.42% 29.000us 84.54% 1.731ms 288.534us 0.000us 0.00% 40.864us 6.811us 6 - aten::copy_ 1.93% 39.552us 80.14% 1.641ms 273.497us 34.080us 44.98% 40.864us 6.811us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 34.080us 44.98% 34.080us 5.680us 6 - Activity Buffer Request 69.16% 1.416ms 69.16% 1.416ms 1.416ms 6.784us 8.95% 6.784us 6.784us 1 - aten::empty_strided 2.99% 61.221us 2.99% 61.221us 10.204us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.05% 185.224us 9.05% 185.224us 30.871us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.69% 34.591us 2.21% 45.260us 3.772us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.52% 10.669us 0.52% 10.669us 0.889us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.08% 42.670us 2.08% 42.670us 7.112us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 4.530us 0.22% 4.530us 4.530us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 327.384us 410.23% 327.384us 327.384us 1 + hf_kernels_rotary 18.75% 148.421us 99.39% 786.852us 786.852us 0.000us 0.00% 89.981us 89.981us 1 + aten::clone 2.67% 21.153us 65.81% 521.010us 86.835us 0.000us 0.00% 47.613us 7.935us 6 + aten::copy_ 4.62% 36.560us 59.19% 468.587us 78.098us 37.437us 46.91% 47.613us 7.935us 6 + _rotary_dba7d1e::apply_rotary 5.10% 40.369us 9.95% 78.790us 13.132us 42.368us 53.09% 42.368us 7.061us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 42.368us 53.09% 42.368us 7.061us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.437us 46.91% 37.437us 6.240us 6 + Activity Buffer Request 28.86% 228.474us 28.86% 228.474us 228.474us 10.176us 12.75% 10.176us 10.176us 1 + aten::empty_strided 3.95% 31.270us 3.95% 31.270us 5.212us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.71% 203.553us 25.71% 203.553us 33.925us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.86% 30.542us 4.88% 38.631us 3.219us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.02% 8.089us 1.02% 8.089us 0.674us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.85% 38.421us 4.85% 38.421us 6.403us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.61% 4.869us 0.61% 4.869us 4.869us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.048ms -Self CUDA time total: 75.774us +Self CPU time total: 791.721us +Self CUDA time total: 79.805us @@ -4773,23 +4555,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 368.925us 253.94% 368.925us 368.925us 1 - hf_kernels_rotary 8.62% 177.641us 99.74% 2.055ms 2.055ms 0.000us 0.00% 169.118us 169.118us 1 - aten::clone 1.42% 29.322us 84.62% 1.743ms 290.539us 0.000us 0.00% 105.470us 17.578us 6 - aten::copy_ 1.92% 39.462us 81.52% 1.679ms 279.897us 81.631us 56.19% 105.470us 17.578us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.631us 56.19% 81.631us 13.605us 6 - _rotary_dba7d1e::apply_rotary 2.27% 46.683us 4.40% 90.665us 15.111us 63.648us 43.81% 63.648us 10.608us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.648us 43.81% 63.648us 10.608us 6 - Activity Buffer Request 70.79% 1.458ms 70.79% 1.458ms 1.458ms 23.839us 16.41% 23.839us 23.839us 1 - aten::empty_strided 1.68% 34.530us 1.68% 34.530us 5.755us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.81% 181.504us 8.81% 181.504us 30.251us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.62% 33.289us 2.09% 43.080us 3.590us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.791us 0.48% 9.791us 0.816us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.13% 43.982us 2.13% 43.982us 7.330us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 5.450us 0.26% 5.450us 5.450us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.133us 229.04% 334.133us 334.133us 1 + hf_kernels_rotary 18.91% 152.747us 99.33% 802.303us 802.303us 0.000us 0.00% 169.593us 169.593us 1 + aten::clone 2.63% 21.282us 65.81% 531.500us 88.583us 0.000us 0.00% 105.244us 17.541us 6 + aten::copy_ 4.22% 34.070us 59.15% 477.709us 79.618us 81.533us 55.89% 105.244us 17.541us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.533us 55.89% 81.533us 13.589us 6 + _rotary_dba7d1e::apply_rotary 4.95% 39.971us 9.71% 78.412us 13.069us 64.349us 44.11% 64.349us 10.725us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.349us 44.11% 64.349us 10.725us 6 + Activity Buffer Request 29.92% 241.694us 29.92% 241.694us 241.694us 23.711us 16.25% 23.711us 23.711us 1 + aten::empty_strided 4.02% 32.509us 4.02% 32.509us 5.418us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.00% 201.945us 25.00% 201.945us 33.657us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.87% 31.225us 4.91% 39.644us 3.304us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.04% 8.419us 1.04% 8.419us 0.702us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.76% 38.441us 4.76% 38.441us 6.407us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.67% 5.380us 0.67% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.060ms -Self CUDA time total: 145.279us +Self CPU time total: 807.683us +Self CUDA time total: 145.882us @@ -4799,23 +4581,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 20.72% 223.838us 78.32% 845.992us 845.992us 0.000us 0.00% 747.476us 747.476us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 688.117us 101.15% 688.117us 688.117us 1 - aten::clone 2.05% 22.091us 45.23% 488.522us 81.420us 0.000us 0.00% 558.423us 93.070us 6 - aten::copy_ 3.67% 39.650us 40.20% 434.190us 72.365us 491.256us 72.21% 558.423us 93.070us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 491.256us 72.21% 491.256us 81.876us 6 - _rotary_dba7d1e::apply_rotary 4.18% 45.161us 8.45% 91.252us 15.209us 189.053us 27.79% 189.053us 31.509us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 189.053us 27.79% 189.053us 31.509us 6 - Activity Buffer Request 19.62% 211.896us 19.62% 211.896us 211.896us 67.167us 9.87% 67.167us 67.167us 1 - aten::empty_strided 2.98% 32.241us 2.98% 32.241us 5.374us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 16.91% 182.644us 16.91% 182.644us 30.441us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.05% 32.939us 3.92% 42.380us 3.532us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.87% 9.441us 0.87% 9.441us 0.787us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.27% 46.091us 4.27% 46.091us 7.682us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 21.68% 234.186us 21.68% 234.186us 234.186us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 13.54% 152.254us 71.57% 804.992us 804.992us 0.000us 0.00% 741.111us 741.111us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 682.359us 101.20% 682.359us 682.359us 1 + aten::clone 1.94% 21.788us 47.45% 533.747us 88.958us 0.000us 0.00% 557.274us 92.879us 6 + aten::copy_ 3.08% 34.611us 42.75% 480.788us 80.131us 490.426us 72.74% 557.274us 92.879us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 490.426us 72.74% 490.426us 81.738us 6 + _rotary_dba7d1e::apply_rotary 3.61% 40.571us 7.01% 78.811us 13.135us 183.837us 27.26% 183.837us 30.639us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 183.837us 27.26% 183.837us 30.639us 6 + Activity Buffer Request 21.83% 245.524us 21.83% 245.524us 245.524us 66.848us 9.91% 66.848us 66.848us 1 + aten::empty_strided 2.77% 31.171us 2.77% 31.171us 5.195us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.84% 200.653us 17.84% 200.653us 33.442us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.81% 31.570us 3.57% 40.180us 3.348us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.77% 8.610us 0.77% 8.610us 0.718us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.40% 38.240us 3.40% 38.240us 6.373us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 28.43% 319.765us 28.43% 319.765us 319.765us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.080ms -Self CUDA time total: 680.309us +Self CPU time total: 1.125ms +Self CUDA time total: 674.263us @@ -4825,33 +4607,33 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 5.41% 154.946us 27.83% 797.061us 797.061us 0.000us 0.00% 2.625ms 2.625ms 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.453ms 100.31% 2.453ms 2.453ms 1 - aten::clone 0.79% 22.601us 17.83% 510.683us 85.114us 0.000us 0.00% 1.396ms 232.586us 6 - aten::copy_ 1.43% 40.940us 15.89% 455.120us 75.853us 1.216ms 49.74% 1.396ms 232.586us 6 - _rotary_dba7d1e::apply_rotary 1.59% 45.590us 3.06% 87.640us 14.607us 1.229ms 50.26% 1.229ms 204.885us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.229ms 50.26% 1.229ms 204.885us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.216ms 49.74% 1.216ms 202.730us 6 - Activity Buffer Request 7.23% 207.076us 7.23% 207.076us 207.076us 179.136us 7.32% 179.136us 179.136us 1 - aten::empty_strided 1.15% 32.962us 1.15% 32.962us 5.494us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.23% 207.104us 7.23% 207.104us 34.517us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.15% 33.011us 1.53% 43.792us 3.649us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.38% 10.781us 0.38% 10.781us 0.898us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.47% 42.050us 1.47% 42.050us 7.008us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 72.17% 2.067ms 72.17% 2.067ms 2.067ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 5.26% 152.407us 28.24% 818.853us 818.853us 0.000us 0.00% 2.611ms 2.611ms 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.442ms 100.34% 2.442ms 2.442ms 1 + aten::clone 0.72% 20.941us 18.92% 548.700us 91.450us 0.000us 0.00% 1.390ms 231.619us 6 + aten::copy_ 1.19% 34.511us 17.07% 495.108us 82.518us 1.212ms 49.82% 1.390ms 231.619us 6 + _rotary_dba7d1e::apply_rotary 1.41% 40.761us 2.75% 79.892us 13.315us 1.221ms 50.18% 1.221ms 203.523us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.221ms 50.18% 1.221ms 203.523us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.212ms 49.82% 1.212ms 202.067us 6 + Activity Buffer Request 8.94% 259.144us 8.94% 259.144us 259.144us 177.311us 7.29% 177.311us 177.311us 1 + aten::empty_strided 1.13% 32.651us 1.13% 32.651us 5.442us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.95% 201.453us 6.95% 201.453us 33.575us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.03% 29.842us 1.31% 37.854us 3.154us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.28% 8.012us 0.28% 8.012us 0.668us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.35% 39.131us 1.35% 39.131us 6.522us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 71.76% 2.081ms 71.76% 2.081ms 2.081ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.864ms -Self CUDA time total: 2.446ms +Self CPU time total: 2.900ms +Self CUDA time total: 2.434ms impl wl p50(ms) ok hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True -hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 True +hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True -hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.10 True -hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.10 True +hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 True @@ -4862,7 +4644,7 @@ hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.85 True -hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.27 True +hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.26 True hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True @@ -4873,12 +4655,12 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True▶ UV Install LogsFetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 60%|██████ | 3/5 [00:00<00:00, 28.46it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.80it/s]+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.23it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.22it/s]Artifacts:
rotary.jsonl