diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -4106,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.22s | Raw @@ -4123,16 +3905,16 @@ Cell: nv | 0.23s
-
Fri Oct 31 20:00:00 2025       
+
Mon Nov 10 21:57:39 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
-|-----------------------------------------+------------------------+----------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0            101W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   26C    P0             88W /  350W |       0MiB /  46068MiB |     22%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4156,7 +3938,7 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 4.67s
+Cell: benchmark | 4.74s
  | 
 
 Raw
@@ -4227,23 +4009,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     426.303us      1837.51%     426.303us     426.303us             1  
-                                      hf_kernels_rotary        12.40%     260.056us        99.66%       2.090ms       2.090ms       0.000us         0.00%      24.480us      24.480us             1  
-                          _rotary_dba7d1e::apply_rotary         2.75%      57.674us         5.07%     106.315us      17.719us      16.128us        69.52%      16.128us       2.688us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.128us        69.52%      16.128us       2.688us             6  
-                                            aten::clone         2.13%      44.582us        79.34%       1.664ms     277.309us       0.000us         0.00%       8.352us       1.392us             6  
-                                            aten::copy_         1.84%      38.562us        74.44%       1.561ms     260.165us       7.072us        30.48%       8.352us       1.392us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.072us        30.48%       7.072us       1.179us             6  
-                                Activity Buffer Request        69.01%       1.447ms        69.01%       1.447ms       1.447ms       1.280us         5.52%       1.280us       1.280us             1  
-                                    aten::empty_strided         2.78%      58.281us         2.78%      58.281us       9.713us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         3.58%      75.121us         3.58%      75.121us      12.520us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.14%      44.780us         2.85%      59.790us       4.983us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.72%      15.010us         0.72%      15.010us       1.251us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.32%      48.641us         2.32%      48.641us       8.107us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.34%       7.100us         0.34%       7.100us       7.100us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     403.678us      1730.44%     403.678us     403.678us             1  
+                                      hf_kernels_rotary         9.63%     231.023us        99.37%       2.384ms       2.384ms       0.000us         0.00%      24.608us      24.608us             1  
+                          _rotary_dba7d1e::apply_rotary         2.18%      52.340us         4.07%      97.602us      16.267us      16.224us        69.55%      16.224us       2.704us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.224us        69.55%      16.224us       2.704us             6  
+                                            aten::clone         1.53%      36.662us        83.59%       2.005ms     334.171us       0.000us         0.00%       8.384us       1.397us             6  
+                                            aten::copy_         1.80%      43.260us        79.70%       1.912ms     318.600us       7.104us        30.45%       8.384us       1.397us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.104us        30.45%       7.104us       1.184us             6  
+                                Activity Buffer Request        74.82%       1.795ms        74.82%       1.795ms       1.795ms       1.280us         5.49%       1.280us       1.280us             1  
+                                    aten::empty_strided         2.37%      56.761us         2.37%      56.761us       9.460us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.07%      73.591us         3.07%      73.591us      12.265us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.65%      39.481us         2.08%      49.901us       4.158us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.43%      10.420us         0.43%      10.420us       0.868us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.89%      45.262us         1.89%      45.262us       7.544us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.63%      15.070us         0.63%      15.070us      15.070us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.097ms
-Self CUDA time total: 23.200us
+Self CPU time total: 2.399ms
+Self CUDA time total: 23.328us
 
 
 
@@ -4253,23 +4035,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     340.796us      1422.00%     340.796us     340.796us             1  
-                                      hf_kernels_rotary         9.48%     182.026us        99.73%       1.916ms       1.916ms       0.000us         0.00%      25.278us      25.278us             1  
-                          _rotary_dba7d1e::apply_rotary         2.22%      42.701us         4.40%      84.531us      14.088us      16.159us        67.42%      16.159us       2.693us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.159us        67.42%      16.159us       2.693us             6  
-                                            aten::clone         1.41%      27.120us        83.58%       1.605ms     267.570us       0.000us         0.00%       9.119us       1.520us             6  
-                                            aten::copy_         2.02%      38.773us        80.45%       1.545ms     257.555us       7.807us        32.58%       9.119us       1.520us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us        32.58%       7.807us       1.301us             6  
-                                Activity Buffer Request        75.56%       1.451ms        75.56%       1.451ms       1.451ms       1.312us         5.47%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.72%      32.970us         1.72%      32.970us       5.495us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.88%      55.291us         2.88%      55.291us       9.215us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.76%      33.749us         2.27%      43.642us       3.637us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.52%       9.893us         0.52%       9.893us       0.824us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.18%      41.830us         2.18%      41.830us       6.972us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       5.161us         0.27%       5.161us       5.161us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     334.494us      1388.06%     334.494us     334.494us             1  
+                                      hf_kernels_rotary         8.19%     181.152us        99.73%       2.206ms       2.206ms       0.000us         0.00%      25.410us      25.410us             1  
+                          _rotary_dba7d1e::apply_rotary         1.81%      39.991us         3.60%      79.751us      13.292us      16.193us        67.20%      16.193us       2.699us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.193us        67.20%      16.193us       2.699us             6  
+                                            aten::clone         1.33%      29.430us        86.17%       1.906ms     317.722us       0.000us         0.00%       9.217us       1.536us             6  
+                                            aten::copy_         1.70%      37.720us        83.32%       1.843ms     307.237us       7.905us        32.80%       9.217us       1.536us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.905us        32.80%       7.905us       1.317us             6  
+                                Activity Buffer Request        79.13%       1.751ms        79.13%       1.751ms       1.751ms       1.312us         5.44%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.51%      33.481us         1.51%      33.481us       5.580us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.49%      55.161us         2.49%      55.161us       9.194us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.38%      30.530us         1.77%      39.222us       3.268us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.39%       8.692us         0.39%       8.692us       0.724us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.80%      39.760us         1.80%      39.760us       6.627us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.870us         0.27%       5.870us       5.870us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.921ms
-Self CUDA time total: 23.966us
+Self CPU time total: 2.212ms
+Self CUDA time total: 24.098us
 
 
 
@@ -4279,23 +4061,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.421us      1391.81%     339.421us     339.421us             1  
-                                      hf_kernels_rotary         9.18%     172.926us        99.76%       1.879ms       1.879ms       0.000us         0.00%      25.699us      25.699us             1  
-                          _rotary_dba7d1e::apply_rotary         2.20%      41.409us         4.51%      85.000us      14.167us      16.481us        67.58%      16.481us       2.747us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.481us        67.58%      16.481us       2.747us             6  
-                                            aten::clone         1.46%      27.581us        83.73%       1.577ms     262.862us       0.000us         0.00%       9.218us       1.536us             6  
-                                            aten::copy_         1.97%      37.091us        80.45%       1.515ms     252.563us       7.906us        32.42%       9.218us       1.536us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.906us        32.42%       7.906us       1.318us             6  
-                                Activity Buffer Request        75.71%       1.426ms        75.71%       1.426ms       1.426ms       1.312us         5.38%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.82%      34.210us         1.82%      34.210us       5.702us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.77%      52.231us         2.77%      52.231us       8.705us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.80%      33.892us         2.33%      43.952us       3.663us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.53%      10.060us         0.53%      10.060us       0.838us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.31%      43.591us         2.31%      43.591us       7.265us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.550us         0.24%       4.550us       4.550us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     333.020us      1374.81%     333.020us     333.020us             1  
+                                      hf_kernels_rotary         8.22%     183.662us        99.77%       2.229ms       2.229ms       0.000us         0.00%      25.535us      25.535us             1  
+                          _rotary_dba7d1e::apply_rotary         1.78%      39.771us         3.54%      79.142us      13.190us      16.479us        68.03%      16.479us       2.747us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.479us        68.03%      16.479us       2.747us             6  
+                                            aten::clone         1.23%      27.502us        86.14%       1.925ms     320.808us       0.000us         0.00%       9.056us       1.509us             6  
+                                            aten::copy_         1.51%      33.780us        83.43%       1.864ms     310.723us       7.744us        31.97%       9.056us       1.509us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        31.97%       7.744us       1.291us             6  
+                                Activity Buffer Request        79.60%       1.779ms        79.60%       1.779ms       1.779ms       1.312us         5.42%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.48%      33.009us         1.48%      33.009us       5.501us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.32%      51.921us         2.32%      51.921us       8.654us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.44%      32.260us         1.87%      41.742us       3.478us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.42%       9.482us         0.42%       9.482us       0.790us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.76%      39.371us         1.76%      39.371us       6.562us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       5.150us         0.23%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.884ms
-Self CUDA time total: 24.387us
+Self CPU time total: 2.235ms
+Self CUDA time total: 24.223us
 
 
 
@@ -4305,23 +4087,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     353.466us      1252.36%     353.466us     353.466us             1  
-                                      hf_kernels_rotary         8.35%     176.747us        99.76%       2.111ms       2.111ms       0.000us         0.00%      30.048us      30.048us             1  
-                          _rotary_dba7d1e::apply_rotary         2.17%      45.850us         4.21%      89.000us      14.833us      17.664us        62.59%      17.664us       2.944us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.664us        62.59%      17.664us       2.944us             6  
-                                            aten::clone         1.36%      28.714us        85.13%       1.802ms     300.274us       0.000us         0.00%      12.384us       2.064us             6  
-                                            aten::copy_         1.83%      38.751us        82.20%       1.740ms     289.944us      10.560us        37.41%      12.384us       2.064us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        37.41%      10.560us       1.760us             6  
-                                Activity Buffer Request        67.60%       1.431ms        67.60%       1.431ms       1.431ms       1.824us         6.46%       1.824us       1.824us             1  
-                                    aten::empty_strided         1.57%      33.269us         1.57%      33.269us       5.545us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.77%     270.306us        12.77%     270.306us      45.051us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.59%      33.568us         2.07%      43.911us       3.659us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.49%      10.343us         0.49%      10.343us       0.862us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.04%      43.150us         2.04%      43.150us       7.192us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       5.130us         0.24%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     330.396us      1170.66%     330.396us     330.396us             1  
+                                      hf_kernels_rotary        19.88%     180.354us        99.43%     901.975us     901.975us       0.000us         0.00%      29.983us      29.983us             1  
+                          _rotary_dba7d1e::apply_rotary         4.33%      39.273us         8.60%      78.013us      13.002us      17.759us        62.92%      17.759us       2.960us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.759us        62.92%      17.759us       2.960us             6  
+                                            aten::clone         2.43%      22.040us        66.64%     604.579us     100.763us       0.000us         0.00%      12.224us       2.037us             6  
+                                            aten::copy_         3.81%      34.600us        60.79%     551.459us      91.910us      10.464us        37.08%      12.224us       2.037us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us        37.08%      10.464us       1.744us             6  
+                                Activity Buffer Request        27.63%     250.684us        27.63%     250.684us     250.684us       1.760us         6.24%       1.760us       1.760us             1  
+                                    aten::empty_strided         3.43%      31.080us         3.43%      31.080us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        29.34%     266.175us        29.34%     266.175us      44.362us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.36%      30.489us         4.30%      39.029us       3.252us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.94%       8.540us         0.94%       8.540us       0.712us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.27%      38.740us         4.27%      38.740us       6.457us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       5.209us         0.57%       5.209us       5.209us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.116ms
-Self CUDA time total: 28.224us
+Self CPU time total: 907.184us
+Self CUDA time total: 28.223us
 
 
 
@@ -4331,23 +4113,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     351.740us      1444.46%     351.740us     351.740us             1  
-                                      hf_kernels_rotary         8.68%     176.155us        99.77%       2.024ms       2.024ms       0.000us         0.00%      25.663us      25.663us             1  
-                          _rotary_dba7d1e::apply_rotary         2.27%      46.099us         4.32%      87.680us      14.613us      16.479us        67.67%      16.479us       2.747us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.479us        67.67%      16.479us       2.747us             6  
-                                            aten::clone         1.42%      28.832us        84.62%       1.717ms     286.091us       0.000us         0.00%       9.184us       1.531us             6  
-                                            aten::copy_         1.86%      37.831us        81.49%       1.653ms     275.519us       7.872us        32.33%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        32.33%       7.872us       1.312us             6  
-                                Activity Buffer Request        70.03%       1.420ms        70.03%       1.420ms       1.420ms       1.312us         5.39%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.71%      34.601us         1.71%      34.601us       5.767us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.60%     194.784us         9.60%     194.784us      32.464us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.63%      33.102us         2.14%      43.512us       3.626us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.51%      10.410us         0.51%      10.410us       0.867us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.05%      41.581us         2.05%      41.581us       6.930us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.660us         0.23%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     333.950us      1371.35%     333.950us     333.950us             1  
+                                      hf_kernels_rotary         7.53%     182.915us        99.79%       2.425ms       2.425ms       0.000us         0.00%      25.664us      25.664us             1  
+                          _rotary_dba7d1e::apply_rotary         1.65%      40.000us         3.26%      79.130us      13.188us      16.545us        67.94%      16.545us       2.758us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.545us        67.94%      16.545us       2.758us             6  
+                                            aten::clone         1.26%      30.642us        87.34%       2.122ms     353.721us       0.000us         0.00%       9.119us       1.520us             6  
+                                            aten::copy_         1.47%      35.799us        84.75%       2.059ms     343.229us       7.807us        32.06%       9.119us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us        32.06%       7.807us       1.301us             6  
+                                Activity Buffer Request        73.06%       1.775ms        73.06%       1.775ms       1.775ms       1.312us         5.39%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.33%      32.310us         1.33%      32.310us       5.385us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.22%     248.434us        10.22%     248.434us      41.406us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.31%      31.720us         1.66%      40.370us       3.364us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.650us         0.36%       8.650us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.61%      39.130us         1.61%      39.130us       6.522us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.21%       5.100us         0.21%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.029ms
-Self CUDA time total: 24.351us
+Self CPU time total: 2.430ms
+Self CUDA time total: 24.352us
 
 
 
@@ -4357,23 +4139,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.111us      1238.38%     349.111us     349.111us             1  
-                                      hf_kernels_rotary        23.24%     192.013us        99.32%     820.571us     820.571us       0.000us         0.00%      30.015us      30.015us             1  
-                          _rotary_dba7d1e::apply_rotary         5.42%      44.795us        10.63%      87.866us      14.644us      17.632us        62.54%      17.632us       2.939us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.632us        62.54%      17.632us       2.939us             6  
-                                            aten::clone         2.69%      22.223us        60.09%     496.442us      82.740us       0.000us         0.00%      12.383us       2.064us             6  
-                                            aten::copy_         4.60%      38.000us        53.48%     441.890us      73.648us      10.559us        37.46%      12.383us       2.064us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.559us        37.46%      10.559us       1.760us             6  
-                                Activity Buffer Request        26.48%     218.816us        26.48%     218.816us     218.816us       1.824us         6.47%       1.824us       1.824us             1  
-                                    aten::empty_strided         3.91%      32.329us         3.91%      32.329us       5.388us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.40%     185.074us        22.40%     185.074us      30.846us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.04%      33.410us         5.36%      44.250us       3.688us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.31%      10.840us         1.31%      10.840us       0.903us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.21%      43.071us         5.21%      43.071us       7.178us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.68%       5.641us         0.68%       5.641us       5.641us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     330.717us      1169.19%     330.717us     330.717us             1  
+                                      hf_kernels_rotary         7.60%     182.573us        99.80%       2.396ms       2.396ms       0.000us         0.00%      30.046us      30.046us             1  
+                          _rotary_dba7d1e::apply_rotary         1.66%      39.960us         3.28%      78.811us      13.135us      17.758us        62.78%      17.758us       2.960us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.758us        62.78%      17.758us       2.960us             6  
+                                            aten::clone         1.18%      28.252us        87.25%       2.095ms     349.108us       0.000us         0.00%      12.288us       2.048us             6  
+                                            aten::copy_         1.56%      37.480us        84.78%       2.035ms     339.209us      10.528us        37.22%      12.288us       2.048us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        37.22%      10.528us       1.755us             6  
+                                Activity Buffer Request        73.02%       1.753ms        73.02%       1.753ms       1.753ms       1.760us         6.22%       1.760us       1.760us             1  
+                                    aten::empty_strided         1.30%      31.140us         1.30%      31.140us       5.190us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.19%     244.675us        10.19%     244.675us      40.779us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.30%      31.158us         1.66%      39.899us       3.325us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.741us         0.36%       8.741us       0.728us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.62%      38.851us         1.62%      38.851us       6.475us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.20%       4.770us         0.20%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 826.212us
-Self CUDA time total: 28.191us
+Self CPU time total: 2.401ms
+Self CUDA time total: 28.286us
 
 
 
@@ -4383,23 +4165,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.984us       852.93%     344.984us     344.984us             1  
-                                      hf_kernels_rotary        22.02%     168.975us        99.39%     762.759us     762.759us       0.000us         0.00%      43.263us      43.263us             1  
-                          _rotary_dba7d1e::apply_rotary         5.75%      44.162us        11.18%      85.802us      14.300us      23.456us        57.99%      23.456us       3.909us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.456us        57.99%      23.456us       3.909us             6  
-                                            aten::clone         2.91%      22.350us        60.45%     463.932us      77.322us       0.000us         0.00%      19.807us       3.301us             6  
-                                            aten::copy_         4.98%      38.249us        53.45%     410.170us      68.362us      16.991us        42.01%      19.807us       3.301us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.991us        42.01%      16.991us       2.832us             6  
-                                Activity Buffer Request        24.55%     188.395us        24.55%     188.395us     188.395us       2.816us         6.96%       2.816us       2.816us             1  
-                                    aten::empty_strided         4.09%      31.412us         4.09%      31.412us       5.235us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        23.91%     183.526us        23.91%     183.526us      30.588us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.40%      33.790us         5.74%      44.050us       3.671us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.34%      10.260us         1.34%      10.260us       0.855us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.43%      41.640us         5.43%      41.640us       6.940us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.61%       4.661us         0.61%       4.661us       4.661us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     331.263us       811.96%     331.263us     331.263us             1  
+                                      hf_kernels_rotary         7.62%     179.163us        99.79%       2.346ms       2.346ms       0.000us         0.00%      43.646us      43.646us             1  
+                          _rotary_dba7d1e::apply_rotary         1.67%      39.309us         3.29%      77.411us      12.902us      23.680us        58.04%      23.680us       3.947us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.680us        58.04%      23.680us       3.947us             6  
+                                            aten::clone         1.17%      27.469us        87.14%       2.049ms     341.486us       0.000us         0.00%      19.966us       3.328us             6  
+                                            aten::copy_         1.49%      35.141us        84.62%       1.990ms     331.589us      17.118us        41.96%      19.966us       3.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.118us        41.96%      17.118us       2.853us             6  
+                                Activity Buffer Request        73.01%       1.717ms        73.01%       1.717ms       1.717ms       2.848us         6.98%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.36%      31.912us         1.36%      31.912us       5.319us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.11%     237.764us        10.11%     237.764us      39.627us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.35%      31.810us         1.74%      40.800us       3.400us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.38%       8.990us         0.38%       8.990us       0.749us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.62%      38.102us         1.62%      38.102us       6.350us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.21%       4.871us         0.21%       4.871us       4.871us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 767.420us
-Self CUDA time total: 40.447us
+Self CPU time total: 2.351ms
+Self CUDA time total: 40.798us
 
 
 
@@ -4409,23 +4191,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.453us       442.64%     347.453us     347.453us             1  
-                                      hf_kernels_rotary        20.37%     160.826us        99.39%     784.751us     784.751us       0.000us         0.00%      91.040us      91.040us             1  
-                                            aten::clone         2.83%      22.340us        62.44%     492.983us      82.164us       0.000us         0.00%      52.865us       8.811us             6  
-                                            aten::copy_         4.65%      36.740us        55.30%     436.663us      72.777us      40.321us        51.37%      52.865us       8.811us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      40.321us        51.37%      40.321us       6.720us             6  
-                          _rotary_dba7d1e::apply_rotary         5.74%      45.350us        11.00%      86.891us      14.482us      38.175us        48.63%      38.175us       6.362us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      38.175us        48.63%      38.175us       6.362us             6  
-                                Activity Buffer Request        27.86%     219.946us        27.86%     219.946us     219.946us      12.544us        15.98%      12.544us      12.544us             1  
-                                    aten::empty_strided         4.30%      33.980us         4.30%      33.980us       5.663us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.79%     179.977us        22.79%     179.977us      29.996us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.35%      34.361us         5.58%      44.051us       3.671us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.23%       9.690us         1.23%       9.690us       0.808us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.26%      41.541us         5.26%      41.541us       6.924us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.61%       4.830us         0.61%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     336.387us       451.94%     336.387us     336.387us             1  
+                                      hf_kernels_rotary         7.84%     184.420us        99.78%       2.346ms       2.346ms       0.000us         0.00%      82.976us      82.976us             1  
+                                            aten::clone         1.21%      28.560us        86.97%       2.045ms     340.779us       0.000us         0.00%      43.553us       7.259us             6  
+                                            aten::copy_         1.54%      36.092us        84.34%       1.983ms     330.495us      35.009us        47.03%      43.553us       7.259us             6  
+                          _rotary_dba7d1e::apply_rotary         1.67%      39.331us         3.28%      77.091us      12.849us      39.423us        52.97%      39.423us       6.571us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.423us        52.97%      39.423us       6.571us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      35.009us        47.03%      35.009us       5.835us             6  
+                                Activity Buffer Request        73.02%       1.717ms        73.02%       1.717ms       1.717ms       8.544us        11.48%       8.544us       8.544us             1  
+                                    aten::empty_strided         1.41%      33.141us         1.41%      33.141us       5.523us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.79%     230.064us         9.79%     230.064us      38.344us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.34%      31.492us         1.69%      39.832us       3.319us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.35%       8.340us         0.35%       8.340us       0.695us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.61%      37.760us         1.61%      37.760us       6.293us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       5.070us         0.22%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 789.581us
-Self CUDA time total: 78.496us
+Self CPU time total: 2.351ms
+Self CUDA time total: 74.432us
 
 
 
@@ -4435,23 +4217,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.324us       858.06%     347.324us     347.324us             1  
-                                      hf_kernels_rotary         8.65%     173.958us        99.77%       2.007ms       2.007ms       0.000us         0.00%      43.325us      43.325us             1  
-                          _rotary_dba7d1e::apply_rotary         2.18%      43.910us         4.21%      84.770us      14.128us      23.423us        57.87%      23.423us       3.904us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.423us        57.87%      23.423us       3.904us             6  
-                                            aten::clone         1.35%      27.211us        84.83%       1.706ms     284.405us       0.000us         0.00%      19.902us       3.317us             6  
-                                            aten::copy_         1.92%      38.681us        81.76%       1.645ms     274.138us      17.055us        42.13%      19.902us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.055us        42.13%      17.055us       2.842us             6  
-                                Activity Buffer Request        70.68%       1.422ms        70.68%       1.422ms       1.422ms       2.847us         7.03%       2.847us       2.847us             1  
-                                    aten::empty_strided         1.71%      34.392us         1.71%      34.392us       5.732us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.16%     184.363us         9.16%     184.363us      30.727us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.62%      32.593us         2.08%      41.861us       3.488us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.46%       9.268us         0.46%       9.268us       0.772us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.03%      40.860us         2.03%      40.860us       6.810us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.670us         0.23%       4.670us       4.670us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     334.720us       824.27%     334.720us     334.720us             1  
+                                      hf_kernels_rotary         7.69%     178.052us        99.76%       2.310ms       2.310ms       0.000us         0.00%      43.488us      43.488us             1  
+                          _rotary_dba7d1e::apply_rotary         1.77%      40.921us         3.42%      79.272us      13.212us      23.680us        58.31%      23.680us       3.947us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.680us        58.31%      23.680us       3.947us             6  
+                                            aten::clone         1.23%      28.463us        86.92%       2.013ms     335.521us       0.000us         0.00%      19.808us       3.301us             6  
+                                            aten::copy_         1.52%      35.247us        84.34%       1.953ms     325.533us      16.928us        41.69%      19.808us       3.301us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us        41.69%      16.928us       2.821us             6  
+                                Activity Buffer Request        73.01%       1.691ms        73.01%       1.691ms       1.691ms       2.880us         7.09%       2.880us       2.880us             1  
+                                    aten::empty_strided         1.36%      31.460us         1.36%      31.460us       5.243us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.81%     227.126us         9.81%     227.126us      37.854us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.37%      31.801us         1.73%      40.020us       3.335us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.35%       8.219us         0.35%       8.219us       0.685us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.66%      38.351us         1.66%      38.351us       6.392us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       5.500us         0.24%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.012ms
-Self CUDA time total: 40.478us
+Self CPU time total: 2.316ms
+Self CUDA time total: 40.608us
 
 
 
@@ -4461,23 +4243,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.785us       476.45%     361.785us     361.785us             1  
-                                      hf_kernels_rotary         8.64%     176.662us        99.77%       2.040ms       2.040ms       0.000us         0.00%      86.685us      86.685us             1  
-                                            aten::clone         1.40%      28.682us        84.64%       1.731ms     288.486us       0.000us         0.00%      47.871us       7.979us             6  
-                                            aten::copy_         1.80%      36.737us        81.55%       1.668ms     277.962us      37.119us        48.88%      47.871us       7.979us             6  
-                          _rotary_dba7d1e::apply_rotary         2.24%      45.910us         4.34%      88.820us      14.803us      38.814us        51.12%      38.814us       6.469us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      38.814us        51.12%      38.814us       6.469us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.119us        48.88%      37.119us       6.187us             6  
-                                Activity Buffer Request        70.82%       1.448ms        70.82%       1.448ms       1.448ms      10.752us        14.16%      10.752us      10.752us             1  
-                                    aten::empty_strided         1.69%      34.462us         1.69%      34.462us       5.744us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.93%     182.677us         8.93%     182.677us      30.446us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.66%      33.994us         2.15%      43.925us       3.660us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.49%       9.931us         0.49%       9.931us       0.828us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.10%      42.910us         2.10%      42.910us       7.152us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.670us         0.23%       4.670us       4.670us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     343.357us       451.99%     343.357us     343.357us             1  
+                                      hf_kernels_rotary         7.23%     182.803us        99.81%       2.522ms       2.522ms       0.000us         0.00%      85.341us      85.341us             1  
+                                            aten::clone         1.16%      29.441us        87.88%       2.221ms     370.131us       0.000us         0.00%      46.013us       7.669us             6  
+                                            aten::copy_         1.42%      35.932us        85.39%       2.158ms     359.654us      36.637us        48.23%      46.013us       7.669us             6  
+                          _rotary_dba7d1e::apply_rotary         1.58%      39.950us         3.09%      78.111us      13.018us      39.328us        51.77%      39.328us       6.555us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.328us        51.77%      39.328us       6.555us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      36.637us        48.23%      36.637us       6.106us             6  
+                                Activity Buffer Request        75.16%       1.899ms        75.16%       1.899ms       1.899ms       9.376us        12.34%       9.376us       9.376us             1  
+                                    aten::empty_strided         1.32%      33.420us         1.32%      33.420us       5.570us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.81%     222.633us         8.81%     222.633us      37.105us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.25%      31.613us         1.61%      40.701us       3.392us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       9.088us         0.36%       9.088us       0.757us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.51%      38.161us         1.51%      38.161us       6.360us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.19%       4.790us         0.19%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.045ms
-Self CUDA time total: 75.933us
+Self CPU time total: 2.527ms
+Self CUDA time total: 75.965us
 
 
 
@@ -4487,23 +4269,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     373.629us       268.97%     373.629us     373.629us             1  
-                                      hf_kernels_rotary         8.95%     179.578us        99.78%       2.002ms       2.002ms       0.000us         0.00%     162.750us     162.750us             1  
-                                            aten::clone         1.48%      29.597us        83.94%       1.684ms     280.680us       0.000us         0.00%     102.944us      17.157us             6  
-                                            aten::copy_         1.82%      36.553us        80.73%       1.620ms     269.962us      79.104us        56.95%     102.944us      17.157us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      79.104us        56.95%      79.104us      13.184us             6  
-                          _rotary_dba7d1e::apply_rotary         2.30%      46.131us         4.57%      91.713us      15.285us      59.806us        43.05%      59.806us       9.968us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      59.806us        43.05%      59.806us       9.968us             6  
-                                Activity Buffer Request        69.91%       1.403ms        69.91%       1.403ms       1.403ms      23.840us        17.16%      23.840us      23.840us             1  
-                                    aten::empty_strided         1.73%      34.712us         1.73%      34.712us       5.785us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.00%     180.563us         9.00%     180.563us      30.094us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.75%      35.198us         2.31%      46.409us       3.867us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.56%      11.211us         0.56%      11.211us       0.934us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.27%      45.582us         2.27%      45.582us       7.597us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       4.510us         0.22%       4.510us       4.510us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.487us       241.29%     335.487us     335.487us             1  
+                                      hf_kernels_rotary         7.48%     174.562us        99.79%       2.329ms       2.329ms       0.000us         0.00%     162.718us     162.718us             1  
+                                            aten::clone         1.24%      29.010us        87.24%       2.036ms     339.299us       0.000us         0.00%     102.494us      17.082us             6  
+                                            aten::copy_         1.51%      35.312us        84.60%       1.974ms     329.037us      78.815us        56.69%     102.494us      17.082us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.815us        56.69%      78.815us      13.136us             6  
+                          _rotary_dba7d1e::apply_rotary         1.71%      39.800us         3.37%      78.741us      13.124us      60.224us        43.31%      60.224us      10.037us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      60.224us        43.31%      60.224us      10.037us             6  
+                                Activity Buffer Request        73.92%       1.725ms        73.92%       1.725ms       1.725ms      23.679us        17.03%      23.679us      23.679us             1  
+                                    aten::empty_strided         1.40%      32.561us         1.40%      32.561us       5.427us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.17%     213.963us         9.17%     213.963us      35.660us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.33%      31.050us         1.69%      39.471us       3.289us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.421us         0.36%       8.421us       0.702us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.67%      38.941us         1.67%      38.941us       6.490us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.21%       4.971us         0.21%       4.971us       4.971us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.006ms
-Self CUDA time total: 138.910us
+Self CPU time total: 2.334ms
+Self CUDA time total: 139.039us
 
 
 
@@ -4513,23 +4295,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         7.56%     177.196us        86.68%       2.032ms       2.032ms       0.000us         0.00%     778.402us     778.402us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     717.248us       101.07%     717.248us     717.248us             1  
-                                            aten::clone         1.23%      28.772us        72.98%       1.711ms     285.141us       0.000us         0.00%     578.626us      96.438us             6  
-                                            aten::copy_         1.64%      38.341us        70.23%       1.646ms     274.415us     509.889us        71.85%     578.626us      96.438us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     509.889us        71.85%     509.889us      84.982us             6  
-                          _rotary_dba7d1e::apply_rotary         2.34%      54.801us         4.25%      99.591us      16.598us     199.776us        28.15%     199.776us      33.296us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     199.776us        28.15%     199.776us      33.296us             6  
-                                Activity Buffer Request        60.86%       1.427ms        60.86%       1.427ms       1.427ms      68.737us         9.69%      68.737us      68.737us             1  
-                                    aten::empty_strided         1.52%      35.581us         1.52%      35.581us       5.930us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.74%     181.435us         7.74%     181.435us      30.239us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.41%      33.151us         1.89%      44.330us       3.694us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.48%      11.179us         0.48%      11.179us       0.932us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.91%      44.790us         1.91%      44.790us       7.465us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        13.32%     312.348us        13.32%     312.348us     312.348us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary        13.11%     152.482us        70.07%     814.833us     814.833us       0.000us         0.00%     767.862us     767.862us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     709.398us       101.13%     709.398us     709.398us             1  
+                                            aten::clone         1.92%      22.371us        46.79%     544.150us      90.692us       0.000us         0.00%     567.671us      94.612us             6  
+                                            aten::copy_         3.06%      35.584us        42.24%     491.229us      81.872us     501.304us        71.46%     567.671us      94.612us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     501.304us        71.46%     501.304us      83.551us             6  
+                          _rotary_dba7d1e::apply_rotary         3.52%      40.960us         6.87%      79.901us      13.317us     200.191us        28.54%     200.191us      33.365us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     200.191us        28.54%     200.191us      33.365us             6  
+                                Activity Buffer Request        20.99%     244.144us        20.99%     244.144us     244.144us      66.367us         9.46%      66.367us      66.367us             1  
+                                    aten::empty_strided         2.63%      30.550us         2.63%      30.550us       5.092us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.19%     211.501us        18.19%     211.501us      35.250us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.57%      29.881us         3.29%      38.300us       3.192us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.72%       8.419us         0.72%       8.419us       0.702us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.35%      38.941us         3.35%      38.941us       6.490us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        29.93%     348.096us        29.93%     348.096us     348.096us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.344ms
-Self CUDA time total: 709.665us
+Self CPU time total: 1.163ms
+Self CUDA time total: 701.495us
 
 
 
@@ -4539,23 +4321,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.629us      1313.11%     349.629us     349.629us             1  
-                                      hf_kernels_rotary         8.75%     174.875us        99.76%       1.994ms       1.994ms       0.000us         0.00%      27.938us      27.938us             1  
-                          _rotary_dba7d1e::apply_rotary         2.16%      43.200us         4.40%      87.900us      14.650us      18.754us        70.43%      18.754us       3.126us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.754us        70.43%      18.754us       3.126us             6  
-                                            aten::clone         1.44%      28.720us        84.48%       1.688ms     281.365us       0.000us         0.00%       9.184us       1.531us             6  
-                                            aten::copy_         1.82%      36.432us        81.36%       1.626ms     271.003us       7.872us        29.57%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        29.57%       7.872us       1.312us             6  
-                                Activity Buffer Request        70.53%       1.410ms        70.53%       1.410ms       1.410ms       1.312us         4.93%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.67%      33.452us         1.67%      33.452us       5.575us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.01%     180.083us         9.01%     180.083us      30.014us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.63%      32.560us         2.14%      42.684us       3.557us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.51%      10.124us         0.51%      10.124us       0.844us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.24%      44.700us         2.24%      44.700us       7.450us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.780us         0.24%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     326.016us      1225.99%     326.016us     326.016us             1  
+                                      hf_kernels_rotary        18.50%     152.323us        99.40%     818.663us     818.663us       0.000us         0.00%      27.904us      27.904us             1  
+                          _rotary_dba7d1e::apply_rotary         4.86%      40.039us         9.57%      78.850us      13.142us      18.752us        70.52%      18.752us       3.125us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.752us        70.52%      18.752us       3.125us             6  
+                                            aten::clone         2.56%      21.061us        66.62%     548.640us      91.440us       0.000us         0.00%       9.152us       1.525us             6  
+                                            aten::copy_         4.19%      34.519us        60.27%     496.387us      82.731us       7.840us        29.48%       9.152us       1.525us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        29.48%       7.840us       1.307us             6  
+                                Activity Buffer Request        29.97%     246.784us        29.97%     246.784us     246.784us       1.312us         4.93%       1.312us       1.312us             1  
+                                    aten::empty_strided         3.79%      31.192us         3.79%      31.192us       5.199us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.12%     215.084us        26.12%     215.084us      35.847us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.71%      30.531us         4.72%      38.850us       3.237us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.01%       8.319us         1.01%       8.319us       0.693us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.71%      38.811us         4.71%      38.811us       6.469us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.60%       4.910us         0.60%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.998ms
-Self CUDA time total: 26.626us
+Self CPU time total: 823.573us
+Self CUDA time total: 26.592us
 
 
 
@@ -4565,23 +4347,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.698us      1282.22%     344.698us     344.698us             1  
-                                      hf_kernels_rotary        22.61%     152.757us        99.23%     670.538us     670.538us       0.000us         0.00%      28.195us      28.195us             1  
-                          _rotary_dba7d1e::apply_rotary         6.64%      44.870us        12.97%      87.630us      14.605us      19.009us        70.71%      19.009us       3.168us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.009us        70.71%      19.009us       3.168us             6  
-                                            aten::clone         3.38%      22.839us        57.25%     386.869us      64.478us       0.000us         0.00%       9.186us       1.531us             6  
-                                            aten::copy_         5.63%      38.041us        49.11%     331.829us      55.305us       7.874us        29.29%       9.186us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.874us        29.29%       7.874us       1.312us             6  
-                                Activity Buffer Request        16.48%     111.363us        16.48%     111.363us     111.363us       1.312us         4.88%       1.312us       1.312us             1  
-                                    aten::empty_strided         4.77%      32.201us         4.77%      32.201us       5.367us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        27.00%     182.425us        27.00%     182.425us      30.404us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.90%      33.085us         6.41%      43.282us       3.607us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.51%      10.197us         1.51%      10.197us       0.850us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         6.33%      42.760us         6.33%      42.760us       7.127us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.77%       5.200us         0.77%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.263us      1209.82%     323.263us     323.263us             1  
+                                      hf_kernels_rotary        17.52%     147.623us        99.42%     837.623us     837.623us       0.000us         0.00%      28.032us      28.032us             1  
+                          _rotary_dba7d1e::apply_rotary         4.62%      38.930us         9.25%      77.941us      12.990us      18.944us        70.90%      18.944us       3.157us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.944us        70.90%      18.944us       3.157us             6  
+                                            aten::clone         2.83%      23.880us        68.02%     573.009us      95.502us       0.000us         0.00%       9.088us       1.515us             6  
+                                            aten::copy_         4.05%      34.160us        61.53%     518.397us      86.400us       7.776us        29.10%       9.088us       1.515us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        29.10%       7.776us       1.296us             6  
+                                Activity Buffer Request        32.41%     273.024us        32.41%     273.024us     273.024us       1.312us         4.91%       1.312us       1.312us             1  
+                                    aten::empty_strided         3.65%      30.732us         3.65%      30.732us       5.122us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.07%     211.213us        25.07%     211.213us      35.202us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.65%      30.720us         4.64%      39.050us       3.254us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       8.330us         0.99%       8.330us       0.694us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.63%      39.011us         4.63%      39.011us       6.502us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.850us         0.58%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 675.738us
-Self CUDA time total: 26.883us
+Self CPU time total: 842.473us
+Self CUDA time total: 26.720us
 
 
 
@@ -4591,22 +4373,22 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.004us      1141.75%     350.004us     350.004us             1  
-                                      hf_kernels_rotary        19.05%     154.214us        99.36%     804.261us     804.261us       0.000us         0.00%      32.414us      32.414us             1  
-                          _rotary_dba7d1e::apply_rotary         5.47%      44.240us        10.98%      88.910us      14.818us      20.064us        65.45%      20.064us       3.344us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.064us        65.45%      20.064us       3.344us             6  
-                                            aten::clone         3.02%      24.421us        63.80%     516.433us      86.072us       0.000us         0.00%      12.350us       2.058us             6  
-                                            aten::copy_         4.66%      37.732us        56.69%     458.901us      76.483us      10.591us        34.55%      12.350us       2.058us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.591us        34.55%      10.591us       1.765us             6  
-                                Activity Buffer Request        29.69%     240.306us        29.69%     240.306us     240.306us       1.759us         5.74%       1.759us       1.759us             1  
-                                    aten::empty_strided         4.09%      33.111us         4.09%      33.111us       5.518us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.34%     180.863us        22.34%     180.863us      30.144us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.15%      33.594us         5.52%      44.704us       3.725us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.37%      11.110us         1.37%      11.110us       0.926us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.52%      44.670us         5.52%      44.670us       7.445us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.64%       5.201us         0.64%       5.201us       5.201us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     317.947us      1037.18%     317.947us     317.947us             1  
+                                      hf_kernels_rotary        18.00%     147.321us        99.35%     812.963us     812.963us       0.000us         0.00%      32.383us      32.383us             1  
+                          _rotary_dba7d1e::apply_rotary         4.88%      39.901us         9.44%      77.251us      12.875us      20.255us        66.07%      20.255us       3.376us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.255us        66.07%      20.255us       3.376us             6  
+                                            aten::clone         2.41%      19.693us        67.19%     549.781us      91.630us       0.000us         0.00%      12.128us       2.021us             6  
+                                            aten::copy_         4.28%      35.023us        61.13%     500.160us      83.360us      10.400us        33.93%      12.128us       2.021us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        33.93%      10.400us       1.733us             6  
+                                Activity Buffer Request        31.00%     253.664us        31.00%     253.664us     253.664us       1.728us         5.64%       1.728us       1.728us             1  
+                                    aten::empty_strided         3.66%      29.928us         3.66%      29.928us       4.988us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.84%     211.473us        25.84%     211.473us      35.245us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.72%      30.411us         4.72%      38.610us       3.218us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.00%       8.199us         1.00%       8.199us       0.683us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.56%      37.350us         4.56%      37.350us       6.225us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.65%       5.289us         0.65%       5.289us       5.289us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 809.462us
+Self CPU time total: 818.252us
 Self CUDA time total: 30.655us
 
 
@@ -4617,23 +4399,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.355us       822.64%     350.355us     350.355us             1  
-                                      hf_kernels_rotary        19.55%     155.605us        99.35%     790.981us     790.981us       0.000us         0.00%      45.469us      45.469us             1  
-                          _rotary_dba7d1e::apply_rotary         5.55%      44.191us        11.02%      87.731us      14.622us      25.565us        60.03%      25.565us       4.261us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.565us        60.03%      25.565us       4.261us             6  
-                                            aten::clone         2.81%      22.389us        63.13%     502.593us      83.766us       0.000us         0.00%      19.904us       3.317us             6  
-                                            aten::copy_         4.90%      39.043us        56.13%     446.833us      74.472us      17.024us        39.97%      19.904us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        39.97%      17.024us       2.837us             6  
-                                Activity Buffer Request        28.37%     225.886us        28.37%     225.886us     225.886us       2.880us         6.76%       2.880us       2.880us             1  
-                                    aten::empty_strided         4.19%      33.371us         4.19%      33.371us       5.562us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.85%     181.904us        22.85%     181.904us      30.317us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.29%      34.142us         5.66%      45.052us       3.754us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.37%      10.910us         1.37%      10.910us       0.909us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.47%      43.540us         5.47%      43.540us       7.257us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.65%       5.140us         0.65%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     331.769us       777.76%     331.769us     331.769us             1  
+                                      hf_kernels_rotary        19.70%     168.549us        99.44%     850.864us     850.864us       0.000us         0.00%      45.537us      45.537us             1  
+                          _rotary_dba7d1e::apply_rotary         4.73%      40.431us         9.19%      78.662us      13.110us      25.697us        60.24%      25.697us       4.283us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.697us        60.24%      25.697us       4.283us             6  
+                                            aten::clone         2.97%      25.433us        65.78%     562.881us      93.814us       0.000us         0.00%      19.840us       3.307us             6  
+                                            aten::copy_         4.23%      36.170us        59.14%     506.068us      84.345us      16.960us        39.76%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us        39.76%      16.960us       2.827us             6  
+                                Activity Buffer Request        30.43%     260.334us        30.43%     260.334us     260.334us       2.880us         6.75%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.67%      31.380us         3.67%      31.380us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        24.49%     209.564us        24.49%     209.564us      34.927us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.75%      32.092us         4.77%      40.772us       3.398us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.01%       8.680us         1.01%       8.680us       0.723us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.47%      38.231us         4.47%      38.231us       6.372us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.56%       4.789us         0.56%       4.789us       4.789us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 796.121us
-Self CUDA time total: 42.589us
+Self CPU time total: 855.653us
+Self CUDA time total: 42.657us
 
 
 
@@ -4643,23 +4425,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.951us      1133.59%     344.951us     344.951us             1  
-                                      hf_kernels_rotary        19.05%     153.418us        99.42%     800.680us     800.680us       0.000us         0.00%      32.125us      32.125us             1  
-                          _rotary_dba7d1e::apply_rotary         5.43%      43.718us        10.83%      87.180us      14.530us      20.095us        66.04%      20.095us       3.349us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.095us        66.04%      20.095us       3.349us             6  
-                                            aten::clone         2.75%      22.180us        64.20%     517.012us      86.169us       0.000us         0.00%      12.030us       2.005us             6  
-                                            aten::copy_         4.82%      38.813us        57.22%     460.802us      76.800us      10.335us        33.96%      12.030us       2.005us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.335us        33.96%      10.335us       1.722us             6  
-                                Activity Buffer Request        30.13%     242.666us        30.13%     242.666us     242.666us       1.695us         5.57%       1.695us       1.695us             1  
-                                    aten::empty_strided         4.23%      34.030us         4.23%      34.030us       5.672us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.27%     179.323us        22.27%     179.323us      29.887us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.11%      33.131us         5.35%      43.070us       3.589us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.23%       9.939us         1.23%       9.939us       0.828us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.40%      43.462us         5.40%      43.462us       7.244us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.58%       4.660us         0.58%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     324.568us      1058.74%     324.568us     324.568us             1  
+                                      hf_kernels_rotary        19.85%     169.202us        99.36%     847.094us     847.094us       0.000us         0.00%      32.384us      32.384us             1  
+                          _rotary_dba7d1e::apply_rotary         4.69%      39.959us         9.27%      78.991us      13.165us      20.352us        66.39%      20.352us       3.392us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.352us        66.39%      20.352us       3.392us             6  
+                                            aten::clone         2.92%      24.890us        65.73%     560.410us      93.402us       0.000us         0.00%      12.032us       2.005us             6  
+                                            aten::copy_         4.20%      35.769us        59.19%     504.659us      84.110us      10.304us        33.61%      12.032us       2.005us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.304us        33.61%      10.304us       1.717us             6  
+                                Activity Buffer Request        30.61%     260.975us        30.61%     260.975us     260.975us       1.728us         5.64%       1.728us       1.728us             1  
+                                    aten::empty_strided         3.62%      30.861us         3.62%      30.861us       5.143us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        24.39%     207.915us        24.39%     207.915us      34.652us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.54%      30.221us         4.51%      38.491us       3.208us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.97%       8.270us         0.97%       8.270us       0.689us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.58%      39.032us         4.58%      39.032us       6.505us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.64%       5.460us         0.64%       5.460us       5.460us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 805.340us
-Self CUDA time total: 30.430us
+Self CPU time total: 852.554us
+Self CUDA time total: 30.656us
 
 
 
@@ -4669,23 +4451,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     358.905us       840.15%     358.905us     358.905us             1  
-                                      hf_kernels_rotary        15.26%     159.123us        99.55%       1.038ms       1.038ms       0.000us         0.00%      45.598us      45.598us             1  
-                          _rotary_dba7d1e::apply_rotary         4.27%      44.490us         8.42%      87.790us      14.632us      25.600us        59.93%      25.600us       4.267us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.600us        59.93%      25.600us       4.267us             6  
-                                            aten::clone         2.23%      23.211us        71.54%     746.059us     124.343us       0.000us         0.00%      19.998us       3.333us             6  
-                                            aten::copy_         3.70%      38.572us        65.96%     687.817us     114.636us      17.119us        40.07%      19.998us       3.333us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.119us        40.07%      17.119us       2.853us             6  
-                                Activity Buffer Request        44.90%     468.242us        44.90%     468.242us     468.242us       2.879us         6.74%       2.879us       2.879us             1  
-                                    aten::empty_strided         3.36%      35.031us         3.36%      35.031us       5.838us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.36%     181.003us        17.36%     181.003us      30.167us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.32%      34.604us         4.33%      45.135us       3.761us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.01%      10.531us         1.01%      10.531us       0.878us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.15%      43.300us         4.15%      43.300us       7.217us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.45%       4.700us         0.45%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.702us       766.04%     328.702us     328.702us             1  
+                                      hf_kernels_rotary        18.09%     152.853us        99.33%     839.363us     839.363us       0.000us         0.00%      45.788us      45.788us             1  
+                          _rotary_dba7d1e::apply_rotary         4.68%      39.541us         9.21%      77.782us      12.964us      25.887us        60.33%      25.887us       4.314us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.887us        60.33%      25.887us       4.314us             6  
+                                            aten::clone         2.66%      22.468us        67.35%     569.108us      94.851us       0.000us         0.00%      19.901us       3.317us             6  
+                                            aten::copy_         4.16%      35.173us        60.88%     514.450us      85.742us      17.022us        39.67%      19.901us       3.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.022us        39.67%      17.022us       2.837us             6  
+                                Activity Buffer Request        32.07%     270.965us        32.07%     270.965us     270.965us       2.879us         6.71%       2.879us       2.879us             1  
+                                    aten::empty_strided         3.81%      32.190us         3.81%      32.190us       5.365us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        24.65%     208.312us        24.65%     208.312us      34.719us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.71%      31.390us         4.69%      39.620us       3.302us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.97%       8.230us         0.97%       8.230us       0.686us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.53%      38.241us         4.53%      38.241us       6.374us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.67%       5.631us         0.67%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.043ms
-Self CUDA time total: 42.719us
+Self CPU time total: 844.994us
+Self CUDA time total: 42.909us
 
 
 
@@ -4695,23 +4477,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     383.638us       432.19%     383.638us     383.638us             1  
-                                      hf_kernels_rotary        19.20%     158.364us        99.38%     819.611us     819.611us       0.000us         0.00%     103.870us     103.870us             1  
-                                            aten::clone         2.74%      22.581us        61.51%     507.313us      84.552us       0.000us         0.00%      63.135us      10.522us             6  
-                                            aten::copy_         4.83%      39.811us        54.76%     451.622us      75.270us      48.031us        54.11%      63.135us      10.522us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      48.031us        54.11%      48.031us       8.005us             6  
-                          _rotary_dba7d1e::apply_rotary         5.49%      45.243us        13.16%     108.504us      18.084us      40.735us        45.89%      40.735us       6.789us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      40.735us        45.89%      40.735us       6.789us             6  
-                                Activity Buffer Request        27.50%     226.825us        27.50%     226.825us     226.825us      15.104us        17.02%      15.104us      15.104us             1  
-                                    aten::empty_strided         4.01%      33.110us         4.01%      33.110us       5.518us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.43%     184.986us        22.43%     184.986us      30.831us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.25%      35.021us         5.51%      45.430us       3.786us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.26%      10.409us         1.26%      10.409us       0.867us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         7.67%      63.261us         7.67%      63.261us      10.543us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.62%       5.141us         0.62%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     337.246us       364.66%     337.246us     337.246us             1  
+                                      hf_kernels_rotary         7.43%     178.431us        99.78%       2.398ms       2.398ms       0.000us         0.00%     107.425us     107.425us             1  
+                                            aten::clone         1.14%      27.439us        87.31%       2.098ms     349.642us       0.000us         0.00%      65.823us      10.970us             6  
+                                            aten::copy_         1.39%      33.333us        84.85%       2.039ms     339.779us      50.880us        55.02%      65.823us      10.970us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      50.880us        55.02%      50.880us       8.480us             6  
+                          _rotary_dba7d1e::apply_rotary         1.70%      40.740us         3.29%      79.070us      13.178us      41.602us        44.98%      41.602us       6.934us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.602us        44.98%      41.602us       6.934us             6  
+                                Activity Buffer Request        74.72%       1.795ms        74.72%       1.795ms       1.795ms      14.943us        16.16%      14.943us      14.943us             1  
+                                    aten::empty_strided         1.32%      31.741us         1.32%      31.741us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.74%     209.903us         8.74%     209.903us      34.984us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.35%      32.344us         1.76%      42.183us       3.515us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.41%       9.839us         0.41%       9.839us       0.820us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.60%      38.330us         1.60%      38.330us       6.388us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       5.280us         0.22%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 824.752us
-Self CUDA time total: 88.766us
+Self CPU time total: 2.403ms
+Self CUDA time total: 92.482us
 
 
 
@@ -4721,23 +4503,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     359.259us       247.18%     359.259us     359.259us             1  
-                                      hf_kernels_rotary        19.06%     158.337us        99.39%     825.781us     825.781us       0.000us         0.00%     168.829us     168.829us             1  
-                                            aten::clone         2.83%      23.549us        64.09%     532.493us      88.749us       0.000us         0.00%     105.470us      17.578us             6  
-                                            aten::copy_         4.58%      38.013us        57.29%     475.972us      79.329us      81.982us        56.41%     105.470us      17.578us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.982us        56.41%      81.982us      13.664us             6  
-                          _rotary_dba7d1e::apply_rotary         5.47%      45.451us        10.86%      90.251us      15.042us      63.359us        43.59%      63.359us      10.560us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.359us        43.59%      63.359us      10.560us             6  
-                                Activity Buffer Request        31.29%     259.966us        31.29%     259.966us     259.966us      23.488us        16.16%      23.488us      23.488us             1  
-                                    aten::empty_strided         3.97%      32.972us         3.97%      32.972us       5.495us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        21.42%     177.993us        21.42%     177.993us      29.665us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.19%      34.839us         5.38%      44.700us       3.725us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.19%       9.861us         1.19%       9.861us       0.822us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.39%      44.800us         5.39%      44.800us       7.467us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.61%       5.100us         0.61%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     331.357us       227.98%     331.357us     331.357us             1  
+                                      hf_kernels_rotary        19.22%     153.403us        99.38%     793.253us     793.253us       0.000us         0.00%     169.054us     169.054us             1  
+                                            aten::clone         2.47%      19.681us        65.33%     521.479us      86.913us       0.000us         0.00%     105.151us      17.525us             6  
+                                            aten::copy_         4.41%      35.219us        59.11%     471.788us      78.631us      81.439us        56.03%     105.151us      17.525us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.439us        56.03%      81.439us      13.573us             6  
+                          _rotary_dba7d1e::apply_rotary         5.09%      40.640us         9.93%      79.270us      13.212us      63.903us        43.97%      63.903us      10.650us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.903us        43.97%      63.903us      10.650us             6  
+                                Activity Buffer Request        29.11%     232.364us        29.11%     232.364us     232.364us      23.712us        16.31%      23.712us      23.712us             1  
+                                    aten::empty_strided         3.76%      30.010us         3.76%      30.010us       5.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.58%     204.205us        25.58%     204.205us      34.034us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.78%      30.171us         4.90%      39.101us       3.258us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.12%       8.930us         1.12%       8.930us       0.744us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.84%      38.630us         4.84%      38.630us       6.438us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.62%       4.940us         0.62%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 830.881us
-Self CUDA time total: 145.341us
+Self CPU time total: 798.193us
+Self CUDA time total: 145.342us
 
 
 
@@ -4747,23 +4529,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     385.725us       509.05%     385.725us     385.725us             1  
-                                      hf_kernels_rotary         8.62%     176.456us        99.78%       2.043ms       2.043ms       0.000us         0.00%      82.558us      82.558us             1  
-                          _rotary_dba7d1e::apply_rotary         2.32%      47.603us         4.41%      90.273us      15.045us      41.694us        55.02%      41.694us       6.949us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.694us        55.02%      41.694us       6.949us             6  
-                                            aten::clone         1.42%      29.000us        84.54%       1.731ms     288.534us       0.000us         0.00%      40.864us       6.811us             6  
-                                            aten::copy_         1.93%      39.552us        80.14%       1.641ms     273.497us      34.080us        44.98%      40.864us       6.811us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      34.080us        44.98%      34.080us       5.680us             6  
-                                Activity Buffer Request        69.16%       1.416ms        69.16%       1.416ms       1.416ms       6.784us         8.95%       6.784us       6.784us             1  
-                                    aten::empty_strided         2.99%      61.221us         2.99%      61.221us      10.204us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.05%     185.224us         9.05%     185.224us      30.871us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.69%      34.591us         2.21%      45.260us       3.772us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.52%      10.669us         0.52%      10.669us       0.889us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.08%      42.670us         2.08%      42.670us       7.112us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       4.530us         0.22%       4.530us       4.530us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     327.384us       410.23%     327.384us     327.384us             1  
+                                      hf_kernels_rotary        18.75%     148.421us        99.39%     786.852us     786.852us       0.000us         0.00%      89.981us      89.981us             1  
+                                            aten::clone         2.67%      21.153us        65.81%     521.010us      86.835us       0.000us         0.00%      47.613us       7.935us             6  
+                                            aten::copy_         4.62%      36.560us        59.19%     468.587us      78.098us      37.437us        46.91%      47.613us       7.935us             6  
+                          _rotary_dba7d1e::apply_rotary         5.10%      40.369us         9.95%      78.790us      13.132us      42.368us        53.09%      42.368us       7.061us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      42.368us        53.09%      42.368us       7.061us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.437us        46.91%      37.437us       6.240us             6  
+                                Activity Buffer Request        28.86%     228.474us        28.86%     228.474us     228.474us      10.176us        12.75%      10.176us      10.176us             1  
+                                    aten::empty_strided         3.95%      31.270us         3.95%      31.270us       5.212us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.71%     203.553us        25.71%     203.553us      33.925us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.86%      30.542us         4.88%      38.631us       3.219us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.02%       8.089us         1.02%       8.089us       0.674us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.85%      38.421us         4.85%      38.421us       6.403us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.61%       4.869us         0.61%       4.869us       4.869us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.048ms
-Self CUDA time total: 75.774us
+Self CPU time total: 791.721us
+Self CUDA time total: 79.805us
 
 
 
@@ -4773,23 +4555,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     368.925us       253.94%     368.925us     368.925us             1  
-                                      hf_kernels_rotary         8.62%     177.641us        99.74%       2.055ms       2.055ms       0.000us         0.00%     169.118us     169.118us             1  
-                                            aten::clone         1.42%      29.322us        84.62%       1.743ms     290.539us       0.000us         0.00%     105.470us      17.578us             6  
-                                            aten::copy_         1.92%      39.462us        81.52%       1.679ms     279.897us      81.631us        56.19%     105.470us      17.578us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.631us        56.19%      81.631us      13.605us             6  
-                          _rotary_dba7d1e::apply_rotary         2.27%      46.683us         4.40%      90.665us      15.111us      63.648us        43.81%      63.648us      10.608us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.648us        43.81%      63.648us      10.608us             6  
-                                Activity Buffer Request        70.79%       1.458ms        70.79%       1.458ms       1.458ms      23.839us        16.41%      23.839us      23.839us             1  
-                                    aten::empty_strided         1.68%      34.530us         1.68%      34.530us       5.755us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.81%     181.504us         8.81%     181.504us      30.251us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.62%      33.289us         2.09%      43.080us       3.590us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.48%       9.791us         0.48%       9.791us       0.816us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.13%      43.982us         2.13%      43.982us       7.330us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       5.450us         0.26%       5.450us       5.450us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     334.133us       229.04%     334.133us     334.133us             1  
+                                      hf_kernels_rotary        18.91%     152.747us        99.33%     802.303us     802.303us       0.000us         0.00%     169.593us     169.593us             1  
+                                            aten::clone         2.63%      21.282us        65.81%     531.500us      88.583us       0.000us         0.00%     105.244us      17.541us             6  
+                                            aten::copy_         4.22%      34.070us        59.15%     477.709us      79.618us      81.533us        55.89%     105.244us      17.541us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.533us        55.89%      81.533us      13.589us             6  
+                          _rotary_dba7d1e::apply_rotary         4.95%      39.971us         9.71%      78.412us      13.069us      64.349us        44.11%      64.349us      10.725us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      64.349us        44.11%      64.349us      10.725us             6  
+                                Activity Buffer Request        29.92%     241.694us        29.92%     241.694us     241.694us      23.711us        16.25%      23.711us      23.711us             1  
+                                    aten::empty_strided         4.02%      32.509us         4.02%      32.509us       5.418us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        25.00%     201.945us        25.00%     201.945us      33.657us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.87%      31.225us         4.91%      39.644us       3.304us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.04%       8.419us         1.04%       8.419us       0.702us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.76%      38.441us         4.76%      38.441us       6.407us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.67%       5.380us         0.67%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.060ms
-Self CUDA time total: 145.279us
+Self CPU time total: 807.683us
+Self CUDA time total: 145.882us
 
 
 
@@ -4799,23 +4581,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary        20.72%     223.838us        78.32%     845.992us     845.992us       0.000us         0.00%     747.476us     747.476us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     688.117us       101.15%     688.117us     688.117us             1  
-                                            aten::clone         2.05%      22.091us        45.23%     488.522us      81.420us       0.000us         0.00%     558.423us      93.070us             6  
-                                            aten::copy_         3.67%      39.650us        40.20%     434.190us      72.365us     491.256us        72.21%     558.423us      93.070us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     491.256us        72.21%     491.256us      81.876us             6  
-                          _rotary_dba7d1e::apply_rotary         4.18%      45.161us         8.45%      91.252us      15.209us     189.053us        27.79%     189.053us      31.509us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     189.053us        27.79%     189.053us      31.509us             6  
-                                Activity Buffer Request        19.62%     211.896us        19.62%     211.896us     211.896us      67.167us         9.87%      67.167us      67.167us             1  
-                                    aten::empty_strided         2.98%      32.241us         2.98%      32.241us       5.374us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.91%     182.644us        16.91%     182.644us      30.441us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.05%      32.939us         3.92%      42.380us       3.532us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.87%       9.441us         0.87%       9.441us       0.787us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.27%      46.091us         4.27%      46.091us       7.682us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        21.68%     234.186us        21.68%     234.186us     234.186us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary        13.54%     152.254us        71.57%     804.992us     804.992us       0.000us         0.00%     741.111us     741.111us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     682.359us       101.20%     682.359us     682.359us             1  
+                                            aten::clone         1.94%      21.788us        47.45%     533.747us      88.958us       0.000us         0.00%     557.274us      92.879us             6  
+                                            aten::copy_         3.08%      34.611us        42.75%     480.788us      80.131us     490.426us        72.74%     557.274us      92.879us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     490.426us        72.74%     490.426us      81.738us             6  
+                          _rotary_dba7d1e::apply_rotary         3.61%      40.571us         7.01%      78.811us      13.135us     183.837us        27.26%     183.837us      30.639us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     183.837us        27.26%     183.837us      30.639us             6  
+                                Activity Buffer Request        21.83%     245.524us        21.83%     245.524us     245.524us      66.848us         9.91%      66.848us      66.848us             1  
+                                    aten::empty_strided         2.77%      31.171us         2.77%      31.171us       5.195us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.84%     200.653us        17.84%     200.653us      33.442us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.81%      31.570us         3.57%      40.180us       3.348us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.77%       8.610us         0.77%       8.610us       0.718us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.40%      38.240us         3.40%      38.240us       6.373us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        28.43%     319.765us        28.43%     319.765us     319.765us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.080ms
-Self CUDA time total: 680.309us
+Self CPU time total: 1.125ms
+Self CUDA time total: 674.263us
 
 
 
@@ -4825,33 +4607,33 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         5.41%     154.946us        27.83%     797.061us     797.061us       0.000us         0.00%       2.625ms       2.625ms             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.453ms       100.31%       2.453ms       2.453ms             1  
-                                            aten::clone         0.79%      22.601us        17.83%     510.683us      85.114us       0.000us         0.00%       1.396ms     232.586us             6  
-                                            aten::copy_         1.43%      40.940us        15.89%     455.120us      75.853us       1.216ms        49.74%       1.396ms     232.586us             6  
-                          _rotary_dba7d1e::apply_rotary         1.59%      45.590us         3.06%      87.640us      14.607us       1.229ms        50.26%       1.229ms     204.885us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.229ms        50.26%       1.229ms     204.885us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.216ms        49.74%       1.216ms     202.730us             6  
-                                Activity Buffer Request         7.23%     207.076us         7.23%     207.076us     207.076us     179.136us         7.32%     179.136us     179.136us             1  
-                                    aten::empty_strided         1.15%      32.962us         1.15%      32.962us       5.494us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.23%     207.104us         7.23%     207.104us      34.517us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.15%      33.011us         1.53%      43.792us       3.649us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.38%      10.781us         0.38%      10.781us       0.898us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.47%      42.050us         1.47%      42.050us       7.008us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        72.17%       2.067ms        72.17%       2.067ms       2.067ms       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         5.26%     152.407us        28.24%     818.853us     818.853us       0.000us         0.00%       2.611ms       2.611ms             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.442ms       100.34%       2.442ms       2.442ms             1  
+                                            aten::clone         0.72%      20.941us        18.92%     548.700us      91.450us       0.000us         0.00%       1.390ms     231.619us             6  
+                                            aten::copy_         1.19%      34.511us        17.07%     495.108us      82.518us       1.212ms        49.82%       1.390ms     231.619us             6  
+                          _rotary_dba7d1e::apply_rotary         1.41%      40.761us         2.75%      79.892us      13.315us       1.221ms        50.18%       1.221ms     203.523us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.221ms        50.18%       1.221ms     203.523us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.212ms        49.82%       1.212ms     202.067us             6  
+                                Activity Buffer Request         8.94%     259.144us         8.94%     259.144us     259.144us     177.311us         7.29%     177.311us     177.311us             1  
+                                    aten::empty_strided         1.13%      32.651us         1.13%      32.651us       5.442us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.95%     201.453us         6.95%     201.453us      33.575us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.03%      29.842us         1.31%      37.854us       3.154us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.28%       8.012us         0.28%       8.012us       0.668us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.35%      39.131us         1.35%      39.131us       6.522us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        71.76%       2.081ms        71.76%       2.081ms       2.081ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.864ms
-Self CUDA time total: 2.446ms
+Self CPU time total: 2.900ms
+Self CUDA time total: 2.434ms
 
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  True
-hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  True
+hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.07  True
 hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.26  True
-hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.10  True
-hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.10  True
+hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  True
@@ -4862,7 +4644,7 @@ hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.85  True
-hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.27  True
+hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.26  True
 hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  True
@@ -4873,12 +4655,12 @@ hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  True
 
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 60%|██████ | 3/5 [00:00<00:00, 28.46it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.80it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.23it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.22it/s]

Artifacts:

rotary.jsonl