diff --git "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" --- "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" +++ "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - ; -} - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - - { - % endif % -} - - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -4106,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.22s | Raw @@ -4123,16 +3905,16 @@ Cell: nv | 0.21s
-
Fri Oct 31 20:00:25 2025       
+
Mon Nov 10 21:57:49 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
-|-----------------------------------------+------------------------+----------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0             79W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   27C    P0             77W /  350W |       0MiB /  46068MiB |     18%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4156,7 +3938,7 @@ Cell: nv | 0.21s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 9.11s
+Cell: benchmark | 10.37s
  | 
 
 Raw
@@ -4210,19 +3992,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     180.703us      4446.43%     180.703us     180.703us             1  
-                               hf_kernels_causal_conv1d         8.48%     160.534us        99.62%       1.886ms       1.886ms       0.000us         0.00%       5.504us       5.504us             1  
-                                         CausalConv1dFn         6.47%     122.423us        91.15%       1.726ms     575.261us       0.000us         0.00%       5.504us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.51%      28.612us        80.84%       1.531ms     510.207us       4.064us       100.00%       5.504us       1.835us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        76.71%       1.452ms        76.71%       1.452ms       1.452ms       1.440us        35.43%       1.440us       1.440us             1  
-                                       aten::empty_like         1.07%      20.220us         3.84%      72.741us      24.247us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.77%      52.521us         2.77%      52.521us      17.507us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.62%      49.571us         2.62%      49.571us      16.524us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.38%       7.101us         0.38%       7.101us       7.101us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     156.321us      3758.62%     156.321us     156.321us             1  
+                               hf_kernels_causal_conv1d         6.87%     159.072us        99.36%       2.300ms       2.300ms       0.000us         0.00%       5.599us       5.599us             1  
+                                         CausalConv1dFn         4.82%     111.622us        92.49%       2.141ms     713.785us       0.000us         0.00%       5.599us       1.866us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.19%      27.462us        84.76%       1.962ms     654.127us       4.159us       100.00%       5.599us       1.866us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.159us       100.00%       4.159us       1.386us             3  
+                                Activity Buffer Request        81.39%       1.884ms        81.39%       1.884ms       1.884ms       1.440us        34.62%       1.440us       1.440us             1  
+                                       aten::empty_like         0.94%      21.650us         2.91%      67.351us      22.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.97%      45.701us         1.97%      45.701us      15.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.18%      50.500us         2.18%      50.500us      16.833us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.64%      14.811us         0.64%      14.811us      14.811us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.893ms
-Self CUDA time total: 4.064us
+Self CPU time total: 2.315ms
+Self CUDA time total: 4.159us
 
 
 
@@ -4232,19 +4014,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.791us      3331.33%     125.791us     125.791us             1  
-                               hf_kernels_causal_conv1d         5.58%      96.392us        99.64%       1.721ms       1.721ms       0.000us         0.00%       5.056us       5.056us             1  
-                                         CausalConv1dFn         4.40%      76.074us        94.06%       1.625ms     541.671us       0.000us         0.00%       5.056us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.52%      26.231us        87.95%       1.519ms     506.473us       3.776us       100.00%       5.056us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
-                                Activity Buffer Request        84.56%       1.461ms        84.56%       1.461ms       1.461ms       1.280us        33.90%       1.280us       1.280us             1  
-                                       aten::empty_like         0.44%       7.590us         1.71%      29.520us       9.840us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.27%      21.930us         1.27%      21.930us       7.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.87%      32.290us         1.87%      32.290us      10.763us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.36%       6.200us         0.36%       6.200us       6.200us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.455us      3297.41%     123.455us     123.455us             1  
+                               hf_kernels_causal_conv1d         4.13%      83.101us        99.73%       2.009ms       2.009ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.66%      73.760us        95.61%       1.926ms     641.917us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.15%      23.071us        90.47%       1.822ms     607.420us       3.744us       100.00%       4.992us       1.664us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
+                                Activity Buffer Request        87.83%       1.769ms        87.83%       1.769ms       1.769ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.39%       7.860us         1.48%      29.730us       9.910us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.09%      21.870us         1.09%      21.870us       7.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.49%      30.082us         1.49%      30.082us      10.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.421us         0.27%       5.421us       5.421us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.728ms
-Self CUDA time total: 3.776us
+Self CPU time total: 2.014ms
+Self CUDA time total: 3.744us
 
 
 
@@ -4254,19 +4036,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.758us      3330.46%     125.758us     125.758us             1  
-                               hf_kernels_causal_conv1d         5.23%      90.742us        99.66%       1.729ms       1.729ms       0.000us         0.00%       5.056us       5.056us             1  
-                                         CausalConv1dFn         4.39%      76.092us        94.43%       1.638ms     546.081us       0.000us         0.00%       5.056us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.50%      26.031us        88.31%       1.532ms     510.660us       3.776us       100.00%       5.056us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
-                                Activity Buffer Request        84.98%       1.474ms        84.98%       1.474ms       1.474ms       1.280us        33.90%       1.280us       1.280us             1  
-                                       aten::empty_like         0.47%       8.201us         1.74%      30.171us      10.057us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.27%      21.970us         1.27%      21.970us       7.323us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.83%      31.671us         1.83%      31.671us      10.557us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.34%       5.850us         0.34%       5.850us       5.850us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.263us      3185.44%     119.263us     119.263us             1  
+                               hf_kernels_causal_conv1d         3.91%      78.640us        99.72%       2.003ms       2.003ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.57%      71.661us        95.80%       1.925ms     641.537us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      22.781us        90.75%       1.823ms     607.693us       3.744us       100.00%       4.992us       1.664us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
+                                Activity Buffer Request        88.14%       1.771ms        88.14%       1.771ms       1.771ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.41%       8.160us         1.49%      29.872us       9.957us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.08%      21.712us         1.08%      21.712us       7.237us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.48%      29.670us         1.48%      29.670us       9.890us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.669us         0.28%       5.669us       5.669us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.735ms
-Self CUDA time total: 3.776us
+Self CPU time total: 2.009ms
+Self CUDA time total: 3.744us
 
 
 
@@ -4276,19 +4058,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.584us      3350.42%     127.584us     127.584us             1  
-                               hf_kernels_causal_conv1d         4.53%      88.983us        99.75%       1.962ms       1.962ms       0.000us         0.00%       5.088us       5.088us             1  
-                                         CausalConv1dFn         3.93%      77.252us        95.23%       1.873ms     624.219us       0.000us         0.00%       5.088us       1.696us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      26.710us        89.83%       1.766ms     588.805us       3.808us       100.00%       5.088us       1.696us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.808us       100.00%       3.808us       1.269us             3  
-                                Activity Buffer Request        74.34%       1.462ms        74.34%       1.462ms       1.462ms       1.280us        33.61%       1.280us       1.280us             1  
-                                       aten::empty_like         0.41%       8.060us         1.47%      28.990us       9.663us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.06%      20.930us         1.06%      20.930us       6.977us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        14.13%     277.777us        14.13%     277.777us      92.592us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       4.831us         0.25%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.790us      3253.81%     121.790us     121.790us             1  
+                               hf_kernels_causal_conv1d         3.48%      76.970us        99.77%       2.208ms       2.208ms       0.000us         0.00%       4.991us       4.991us             1  
+                                         CausalConv1dFn         3.33%      73.753us        96.30%       2.131ms     710.368us       0.000us         0.00%       4.991us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.03%      22.770us        91.66%       2.029ms     676.184us       3.743us       100.00%       4.991us       1.664us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.743us       100.00%       3.743us       1.248us             3  
+                                Activity Buffer Request        81.47%       1.803ms        81.47%       1.803ms       1.803ms       1.248us        33.34%       1.248us       1.248us             1  
+                                       aten::empty_like         0.36%       7.858us         1.30%      28.800us       9.600us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.95%      20.942us         0.95%      20.942us       6.981us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.17%     202.863us         9.17%     202.863us      67.621us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       4.991us         0.23%       4.991us       4.991us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.966ms
-Self CUDA time total: 3.808us
+Self CPU time total: 2.213ms
+Self CUDA time total: 3.743us
 
 
 
@@ -4298,19 +4080,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.686us      2639.84%     126.686us     126.686us             1  
-                               hf_kernels_causal_conv1d         4.55%      87.622us        99.73%       1.920ms       1.920ms       0.000us         0.00%       6.430us       6.430us             1  
-                                         CausalConv1dFn         3.92%      75.482us        95.18%       1.832ms     610.789us       0.000us         0.00%       6.430us       2.143us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.44%      27.663us        89.66%       1.726ms     575.372us       4.799us       100.00%       6.430us       2.143us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.799us       100.00%       4.799us       1.600us             3  
-                                Activity Buffer Request        74.49%       1.434ms        74.49%       1.434ms       1.434ms       1.631us        33.99%       1.631us       1.631us             1  
-                                       aten::empty_like         0.42%       8.140us         1.60%      30.770us      10.257us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.18%      22.630us         1.18%      22.630us       7.543us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        13.74%     264.526us        13.74%     264.526us      88.175us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.120us         0.27%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.073us      2547.57%     123.073us     123.073us             1  
+                               hf_kernels_causal_conv1d         3.82%      79.680us        99.75%       2.083ms       2.083ms       0.000us         0.00%       6.463us       6.463us             1  
+                                         CausalConv1dFn         3.53%      73.692us        95.93%       2.003ms     667.744us       0.000us         0.00%       6.463us       2.154us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.17%      24.371us        90.98%       1.900ms     633.257us       4.831us       100.00%       6.463us       2.154us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.831us       100.00%       4.831us       1.610us             3  
+                                Activity Buffer Request        81.73%       1.707ms        81.73%       1.707ms       1.707ms       1.632us        33.78%       1.632us       1.632us             1  
+                                       aten::empty_like         0.42%       8.791us         1.43%      29.771us       9.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.00%      20.980us         1.00%      20.980us       6.993us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.08%     168.682us         8.08%     168.682us      56.227us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.250us         0.25%       5.250us       5.250us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.925ms
-Self CUDA time total: 4.799us
+Self CPU time total: 2.088ms
+Self CUDA time total: 4.831us
 
 
 
@@ -4320,19 +4102,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.083us      2423.58%     117.083us     117.083us             1  
-                               hf_kernels_causal_conv1d        12.24%      83.203us        99.28%     674.957us     674.957us       0.000us         0.00%       6.463us       6.463us             1  
-                                         CausalConv1dFn        10.43%      70.911us        87.04%     591.754us     197.251us       0.000us         0.00%       6.463us       2.154us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.93%      26.710us        72.18%     490.682us     163.561us       4.831us       100.00%       6.463us       2.154us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.831us       100.00%       4.831us       1.610us             3  
-                                Activity Buffer Request        32.42%     220.416us        32.42%     220.416us     220.416us       1.632us        33.78%       1.632us       1.632us             1  
-                                       aten::empty_like         1.07%       7.270us         4.44%      30.161us      10.054us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.37%      22.891us         3.37%      22.891us       7.630us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.83%     243.556us        35.83%     243.556us      81.185us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.72%       4.870us         0.72%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.883us      2373.55%     113.883us     113.883us             1  
+                               hf_kernels_causal_conv1d        15.03%      75.250us        99.01%     495.717us     495.717us       0.000us         0.00%       6.430us       6.430us             1  
+                                         CausalConv1dFn        13.70%      68.601us        83.98%     420.467us     140.156us       0.000us         0.00%       6.430us       2.143us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.03%      25.190us        64.69%     323.874us     107.958us       4.798us       100.00%       6.430us       2.143us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.798us       100.00%       4.798us       1.599us             3  
+                                Activity Buffer Request        28.01%     140.222us        28.01%     140.222us     140.222us       1.632us        34.01%       1.632us       1.632us             1  
+                                       aten::empty_like         1.45%       7.260us         5.59%      27.992us       9.331us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.14%      20.732us         4.14%      20.732us       6.911us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.65%     158.462us        31.65%     158.462us      52.821us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.99%       4.940us         0.99%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 679.827us
-Self CUDA time total: 4.831us
+Self CPU time total: 500.657us
+Self CUDA time total: 4.798us
 
 
 
@@ -4342,19 +4124,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.381us      1167.35%     124.381us     124.381us             1  
-                               hf_kernels_causal_conv1d         4.48%      85.542us        99.75%       1.904ms       1.904ms       0.000us         0.00%      14.271us      14.271us             1  
-                                         CausalConv1dFn         3.83%      73.182us        95.27%       1.819ms     606.282us       0.000us         0.00%      14.271us       4.757us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      26.960us        89.88%       1.716ms     571.988us      10.655us       100.00%      14.271us       4.757us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.655us       100.00%      10.655us       3.552us             3  
-                                Activity Buffer Request        76.01%       1.451ms        76.01%       1.451ms       1.451ms       3.616us        33.94%       3.616us       3.616us             1  
-                                       aten::empty_like         0.43%       8.120us         1.56%      29.700us       9.900us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.13%      21.580us         1.13%      21.580us       7.193us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        12.45%     237.787us        12.45%     237.787us      79.262us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       4.860us         0.25%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.365us      1148.32%     122.365us     122.365us             1  
+                               hf_kernels_causal_conv1d         3.51%      76.530us        99.77%       2.176ms       2.176ms       0.000us         0.00%      14.208us      14.208us             1  
+                                         CausalConv1dFn         3.29%      71.713us        96.26%       2.099ms     699.771us       0.000us         0.00%      14.208us       4.736us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.11%      24.170us        91.65%       1.999ms     666.274us      10.656us       100.00%      14.208us       4.736us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us       100.00%      10.656us       3.552us             3  
+                                Activity Buffer Request        82.90%       1.808ms        82.90%       1.808ms       1.808ms       3.552us        33.33%       3.552us       3.552us             1  
+                                       aten::empty_like         0.37%       8.070us         1.32%      28.780us       9.593us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.95%      20.710us         0.95%      20.710us       6.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.64%     166.713us         7.64%     166.713us      55.571us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       5.051us         0.23%       5.051us       5.051us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.909ms
-Self CUDA time total: 10.655us
+Self CPU time total: 2.181ms
+Self CUDA time total: 10.656us
 
 
 
@@ -4364,19 +4146,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.652us      1120.72%     122.652us     122.652us             1  
-                               hf_kernels_causal_conv1d        12.91%      86.303us        99.27%     663.588us     663.588us       0.000us         0.00%      14.624us      14.624us             1  
-                                         CausalConv1dFn        10.74%      71.821us        86.36%     577.285us     192.428us       0.000us         0.00%      14.624us       4.875us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.81%      25.480us        71.21%     476.023us     158.674us      10.944us       100.00%      14.624us       4.875us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.944us       100.00%      10.944us       3.648us             3  
-                                Activity Buffer Request        32.82%     219.426us        32.82%     219.426us     219.426us       3.680us        33.63%       3.680us       3.680us             1  
-                                       aten::empty_like         1.14%       7.591us         4.40%      29.441us       9.814us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.27%      21.850us         3.27%      21.850us       7.283us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.57%     231.117us        34.57%     231.117us      77.039us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.73%       4.900us         0.73%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.676us      1057.08%     115.676us     115.676us             1  
+                               hf_kernels_causal_conv1d        15.90%      75.141us        98.97%     467.777us     467.777us       0.000us         0.00%      14.654us      14.654us             1  
+                                         CausalConv1dFn        14.89%      70.359us        83.07%     392.636us     130.879us       0.000us         0.00%      14.654us       4.885us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.95%      23.391us        62.24%     294.186us      98.062us      10.943us       100.00%      14.654us       4.885us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.943us       100.00%      10.943us       3.648us             3  
+                                Activity Buffer Request        23.54%     111.281us        23.54%     111.281us     111.281us       3.711us        33.91%       3.711us       3.711us             1  
+                                       aten::empty_like         1.66%       7.830us         5.94%      28.091us       9.364us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.29%      20.261us         4.29%      20.261us       6.754us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.75%     159.514us        33.75%     159.514us      53.171us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.03%       4.890us         1.03%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 668.488us
-Self CUDA time total: 10.944us
+Self CPU time total: 472.667us
+Self CUDA time total: 10.943us
 
 
 
@@ -4386,19 +4168,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.430us      1181.43%     130.430us     130.430us             1  
-                               hf_kernels_causal_conv1d         4.23%      79.341us        99.73%       1.871ms       1.871ms       0.000us         0.00%      14.784us      14.784us             1  
-                                         CausalConv1dFn         4.03%      75.521us        95.50%       1.792ms     597.206us       0.000us         0.00%      14.784us       4.928us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      26.810us        89.82%       1.685ms     561.675us      11.040us       100.00%      14.784us       4.928us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.040us       100.00%      11.040us       3.680us             3  
-                                Activity Buffer Request        77.07%       1.446ms        77.07%       1.446ms       1.446ms       3.744us        33.91%       3.744us       3.744us             1  
-                                       aten::empty_like         0.44%       8.272us         1.66%      31.072us      10.357us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.22%      22.800us         1.22%      22.800us       7.600us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        11.32%     212.286us        11.32%     212.286us      70.762us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.130us         0.27%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.422us      1124.47%     123.422us     123.422us             1  
+                               hf_kernels_causal_conv1d         3.69%      77.100us        99.75%       2.084ms       2.084ms       0.000us         0.00%      14.656us      14.656us             1  
+                                         CausalConv1dFn         3.52%      73.471us        96.06%       2.007ms     668.988us       0.000us         0.00%      14.656us       4.885us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.660us        90.70%       1.895ms     631.647us      10.976us       100.00%      14.656us       4.885us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
+                                Activity Buffer Request        81.81%       1.709ms        81.81%       1.709ms       1.709ms       3.680us        33.53%       3.680us       3.680us             1  
+                                       aten::empty_like         0.81%      17.020us         1.85%      38.551us      12.850us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.03%      21.531us         1.03%      21.531us       7.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.76%     162.104us         7.76%     162.104us      54.035us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.260us         0.25%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.876ms
-Self CUDA time total: 11.040us
+Self CPU time total: 2.089ms
+Self CUDA time total: 10.976us
 
 
 
@@ -4408,19 +4190,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.097us      1060.18%     120.097us     120.097us             1  
-                               hf_kernels_causal_conv1d        13.35%      76.301us        99.17%     566.674us     566.674us       0.000us         0.00%      15.168us      15.168us             1  
-                                         CausalConv1dFn        12.80%      73.153us        85.81%     490.373us     163.458us       0.000us         0.00%      15.168us       5.056us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.71%      26.911us        68.00%     388.569us     129.523us      11.328us       100.00%      15.168us       5.056us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.328us       100.00%      11.328us       3.776us             3  
-                                Activity Buffer Request        34.49%     197.075us        34.49%     197.075us     197.075us       3.840us        33.90%       3.840us       3.840us             1  
-                                       aten::empty_like         1.29%       7.379us         5.01%      28.651us       9.550us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.72%      21.272us         3.72%      21.272us       7.091us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        28.80%     164.583us        28.80%     164.583us      54.861us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.83%       4.760us         0.83%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.952us      1044.29%     117.952us     117.952us             1  
+                               hf_kernels_causal_conv1d        16.01%      73.960us        98.90%     456.837us     456.837us       0.000us         0.00%      15.071us      15.071us             1  
+                                         CausalConv1dFn        15.53%      71.741us        82.89%     382.877us     127.626us       0.000us         0.00%      15.071us       5.024us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.93%      22.791us        61.20%     282.685us      94.228us      11.295us       100.00%      15.071us       5.024us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.295us       100.00%      11.295us       3.765us             3  
+                                Activity Buffer Request        21.70%     100.232us        21.70%     100.232us     100.232us       3.776us        33.43%       3.776us       3.776us             1  
+                                       aten::empty_like         1.73%       7.970us         6.16%      28.451us       9.484us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.43%      20.481us         4.43%      20.481us       6.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.57%     159.662us        34.57%     159.662us      53.221us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.10%       5.060us         1.10%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 571.434us
-Self CUDA time total: 11.328us
+Self CPU time total: 461.897us
+Self CUDA time total: 11.295us
 
 
 
@@ -4430,19 +4212,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.919us       265.71%     133.919us     133.919us             1  
-                               hf_kernels_causal_conv1d         4.38%      80.552us        99.73%       1.836ms       1.836ms       0.000us         0.00%      83.873us      83.873us             1  
-                                         CausalConv1dFn         4.09%      75.353us        95.35%       1.755ms     585.145us       0.000us         0.00%      83.873us      27.958us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.33%      24.410us        89.50%       1.648ms     549.264us      50.401us       100.00%      83.873us      27.958us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.401us       100.00%      50.401us      16.800us             3  
-                                Activity Buffer Request        79.01%       1.455ms        79.01%       1.455ms       1.455ms      33.472us        66.41%      33.472us      33.472us             1  
-                                       aten::empty_like         0.45%       8.369us         1.75%      32.290us      10.763us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.30%      23.921us         1.30%      23.921us       7.974us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.17%     168.764us         9.17%     168.764us      56.255us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.020us         0.27%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.158us       256.57%     128.158us     128.158us             1  
+                               hf_kernels_causal_conv1d         3.51%      75.280us        99.75%       2.140ms       2.140ms       0.000us         0.00%      83.102us      83.102us             1  
+                                         CausalConv1dFn         3.36%      72.172us        96.24%       2.065ms     688.218us       0.000us         0.00%      83.102us      27.701us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      24.540us        91.55%       1.964ms     654.657us      49.951us       100.00%      83.102us      27.701us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      49.951us       100.00%      49.951us      16.650us             3  
+                                Activity Buffer Request        82.86%       1.778ms        82.86%       1.778ms       1.778ms      33.151us        66.37%      33.151us      33.151us             1  
+                                       aten::empty_like         0.37%       7.920us         1.33%      28.510us       9.503us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.96%      20.590us         0.96%      20.590us       6.863us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.54%     161.824us         7.54%     161.824us      53.941us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.290us         0.25%       5.290us       5.290us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.841ms
-Self CUDA time total: 50.401us
+Self CPU time total: 2.145ms
+Self CUDA time total: 49.951us
 
 
 
@@ -4452,19 +4234,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.005us       256.03%     131.005us     131.005us             1  
-                               hf_kernels_causal_conv1d        11.69%      77.241us        99.25%     655.717us     655.717us       0.000us         0.00%      85.534us      85.534us             1  
-                                         CausalConv1dFn        10.97%      72.503us        87.56%     578.476us     192.825us       0.000us         0.00%      85.534us      28.511us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.89%      25.692us        71.76%     474.103us     158.034us      51.167us       100.00%      85.534us      28.511us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.167us       100.00%      51.167us      17.056us             3  
-                                Activity Buffer Request        43.08%     284.587us        43.08%     284.587us     284.587us      34.367us        67.17%      34.367us      34.367us             1  
-                                       aten::empty_like         1.14%       7.549us         4.82%      31.870us      10.623us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.68%      24.321us         3.68%      24.321us       8.107us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.80%     163.824us        24.80%     163.824us      54.608us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.75%       4.929us         0.75%       4.929us       4.929us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.310us       261.10%     121.310us     121.310us             1  
+                               hf_kernels_causal_conv1d        16.42%      74.560us        98.88%     448.987us     448.987us       0.000us         0.00%      75.933us      75.933us             1  
+                                         CausalConv1dFn        15.28%      69.392us        82.46%     374.427us     124.809us       0.000us         0.00%      75.933us      25.311us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.01%      22.740us        60.80%     276.074us      92.025us      46.462us       100.00%      75.933us      25.311us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.462us       100.00%      46.462us      15.487us             3  
+                                Activity Buffer Request        21.27%      96.581us        21.27%      96.581us      96.581us      29.471us        63.43%      29.471us      29.471us             1  
+                                       aten::empty_like         1.63%       7.411us         6.38%      28.961us       9.654us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.75%      21.550us         4.75%      21.550us       7.183us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.52%     156.753us        34.52%     156.753us      52.251us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.12%       5.090us         1.12%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 660.646us
-Self CUDA time total: 51.167us
+Self CPU time total: 454.077us
+Self CUDA time total: 46.462us
 
 
 
@@ -4474,19 +4256,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.686us      3040.89%     118.686us     118.686us             1  
-                               hf_kernels_causal_conv1d        11.60%      73.750us        99.24%     631.216us     631.216us       0.000us         0.00%       5.183us       5.183us             1  
-                                         CausalConv1dFn        11.30%      71.845us        87.65%     557.466us     185.822us       0.000us         0.00%       5.183us       1.728us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.22%      26.861us        71.87%     457.101us     152.367us       3.903us       100.00%       5.183us       1.728us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.903us       100.00%       3.903us       1.301us             3  
-                                Activity Buffer Request        42.38%     269.577us        42.38%     269.577us     269.577us       1.280us        32.80%       1.280us       1.280us             1  
-                                       aten::empty_like         1.23%       7.810us         4.48%      28.520us       9.507us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.26%      20.710us         3.26%      20.710us       6.903us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        25.26%     160.663us        25.26%     160.663us      53.554us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.76%       4.821us         0.76%       4.821us       4.821us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.254us      3312.35%     128.254us     128.254us             1  
+                               hf_kernels_causal_conv1d         3.31%      74.540us        99.77%       2.245ms       2.245ms       0.000us         0.00%       5.120us       5.120us             1  
+                                         CausalConv1dFn         3.41%      76.802us        96.46%       2.170ms     723.418us       0.000us         0.00%       5.120us       1.707us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.08%      24.209us        91.78%       2.065ms     688.374us       3.872us       100.00%       5.120us       1.707us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.872us       100.00%       3.872us       1.291us             3  
+                                Activity Buffer Request        83.69%       1.883ms        83.69%       1.883ms       1.883ms       1.248us        32.23%       1.248us       1.248us             1  
+                                       aten::empty_like         0.34%       7.679us         1.26%      28.331us       9.444us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.92%      20.652us         0.92%      20.652us       6.884us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.01%     157.803us         7.01%     157.803us      52.601us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       5.180us         0.23%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 636.037us
-Self CUDA time total: 3.903us
+Self CPU time total: 2.250ms
+Self CUDA time total: 3.872us
 
 
 
@@ -4496,19 +4278,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.221us      3029.76%     120.221us     120.221us             1  
-                               hf_kernels_causal_conv1d        13.01%      75.082us        99.09%     571.775us     571.775us       0.000us         0.00%       5.248us       5.248us             1  
-                                         CausalConv1dFn        12.35%      71.241us        86.08%     496.693us     165.564us       0.000us         0.00%       5.248us       1.749us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.88%      28.181us        68.58%     395.720us     131.907us       3.968us       100.00%       5.248us       1.749us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
-                                Activity Buffer Request        36.26%     209.246us        36.26%     209.246us     209.246us       1.280us        32.26%       1.280us       1.280us             1  
-                                       aten::empty_like         1.42%       8.172us         5.15%      29.732us       9.911us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.74%      21.560us         3.74%      21.560us       7.187us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.43%     158.293us        27.43%     158.293us      52.764us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.91%       5.270us         0.91%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.470us      3059.11%     117.470us     117.470us             1  
+                               hf_kernels_causal_conv1d        16.52%      75.490us        98.91%     451.907us     451.907us       0.000us         0.00%       5.056us       5.056us             1  
+                                         CausalConv1dFn        15.55%      71.061us        82.39%     376.417us     125.472us       0.000us         0.00%       5.056us       1.685us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.27%      24.090us        60.40%     275.984us      91.995us       3.840us       100.00%       5.056us       1.685us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
+                                Activity Buffer Request        20.75%      94.821us        20.75%      94.821us      94.821us       1.216us        31.67%       1.216us       1.216us             1  
+                                       aten::empty_like         1.80%       8.242us         6.43%      29.372us       9.791us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.62%      21.130us         4.62%      21.130us       7.043us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.38%     157.073us        34.38%     157.073us      52.358us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.09%       4.990us         1.09%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 577.045us
-Self CUDA time total: 3.968us
+Self CPU time total: 456.897us
+Self CUDA time total: 3.840us
 
 
 
@@ -4518,19 +4300,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.374us      2843.36%     117.374us     117.374us             1  
-                               hf_kernels_causal_conv1d        14.38%      74.792us        98.97%     514.843us     514.843us       0.000us         0.00%       5.504us       5.504us             1  
-                                         CausalConv1dFn        13.25%      68.940us        84.59%     440.051us     146.684us       0.000us         0.00%       5.504us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.99%      25.981us        65.51%     340.779us     113.593us       4.128us       100.00%       5.504us       1.835us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.128us       100.00%       4.128us       1.376us             3  
-                                Activity Buffer Request        29.84%     155.214us        29.84%     155.214us     155.214us       1.376us        33.33%       1.376us       1.376us             1  
-                                       aten::empty_like         1.55%       8.080us         5.83%      30.332us      10.111us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.28%      22.252us         4.28%      22.252us       7.417us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.68%     159.584us        30.68%     159.584us      53.195us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.03%       5.380us         1.03%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.191us      2958.18%     120.191us     120.191us             1  
+                               hf_kernels_causal_conv1d         3.64%      78.360us        99.76%       2.149ms       2.149ms       0.000us         0.00%       5.406us       5.406us             1  
+                                         CausalConv1dFn         3.37%      72.531us        96.13%       2.071ms     690.275us       0.000us         0.00%       5.406us       1.802us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.05%      22.591us        91.41%       1.969ms     656.417us       4.063us       100.00%       5.406us       1.802us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
+                                Activity Buffer Request        83.09%       1.790ms        83.09%       1.790ms       1.790ms       1.343us        33.05%       1.343us       1.343us             1  
+                                       aten::empty_like         0.37%       8.020us         1.35%      29.041us       9.680us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.98%      21.021us         0.98%      21.021us       7.007us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.27%     156.703us         7.27%     156.703us      52.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       5.100us         0.24%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 520.223us
-Self CUDA time total: 4.128us
+Self CPU time total: 2.154ms
+Self CUDA time total: 4.063us
 
 
 
@@ -4540,19 +4322,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.831us      2875.49%     116.831us     116.831us             1  
-                               hf_kernels_causal_conv1d        13.78%      75.282us        99.09%     541.484us     541.484us       0.000us         0.00%       5.439us       5.439us             1  
-                                         CausalConv1dFn        12.58%      68.741us        85.32%     466.202us     155.401us       0.000us         0.00%       5.439us       1.813us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.76%      26.021us        67.34%     367.980us     122.660us       4.063us       100.00%       5.439us       1.813us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
-                                Activity Buffer Request        33.52%     183.175us        33.52%     183.175us     183.175us       1.376us        33.87%       1.376us       1.376us             1  
-                                       aten::empty_like         1.37%       7.489us         5.40%      29.481us       9.827us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.02%      21.992us         4.02%      21.992us       7.331us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        29.06%     158.784us        29.06%     158.784us      52.928us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.91%       4.951us         0.91%       4.951us       4.951us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.509us      2988.81%     120.509us     120.509us             1  
+                               hf_kernels_causal_conv1d        16.24%      73.950us        98.87%     450.317us     450.317us       0.000us         0.00%       5.376us       5.376us             1  
+                                         CausalConv1dFn        17.23%      78.473us        82.64%     376.367us     125.456us       0.000us         0.00%       5.376us       1.792us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.08%      23.119us        59.28%     269.974us      89.991us       4.032us       100.00%       5.376us       1.792us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
+                                Activity Buffer Request        19.95%      90.851us        19.95%      90.851us      90.851us       1.344us        33.33%       1.344us       1.344us             1  
+                                       aten::empty_like         1.73%       7.890us         6.13%      27.920us       9.307us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.40%      20.030us         4.40%      20.030us       6.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.25%     156.004us        34.25%     156.004us      52.001us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.13%       5.130us         1.13%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 546.435us
-Self CUDA time total: 4.063us
+Self CPU time total: 455.447us
+Self CUDA time total: 4.032us
 
 
 
@@ -4562,19 +4344,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.806us      2228.53%     119.806us     119.806us             1  
-                               hf_kernels_causal_conv1d        11.93%      76.073us        99.21%     632.507us     632.507us       0.000us         0.00%       7.200us       7.200us             1  
-                                         CausalConv1dFn        11.21%      71.480us        87.28%     556.434us     185.478us       0.000us         0.00%       7.200us       2.400us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.13%      26.361us        71.46%     455.612us     151.871us       5.376us       100.00%       7.200us       2.400us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
-                                Activity Buffer Request        42.49%     270.867us        42.49%     270.867us     270.867us       1.824us        33.93%       1.824us       1.824us             1  
-                                       aten::empty_like         1.24%       7.892us         4.60%      29.342us       9.781us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.36%      21.450us         3.36%      21.450us       7.150us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.84%     158.384us        24.84%     158.384us      52.795us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.79%       5.050us         0.79%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.767us      2334.71%     124.767us     124.767us             1  
+                               hf_kernels_causal_conv1d         3.64%      76.791us        99.75%       2.102ms       2.102ms       0.000us         0.00%       7.168us       7.168us             1  
+                                         CausalConv1dFn         3.46%      72.920us        96.11%       2.025ms     674.997us       0.000us         0.00%       7.168us       2.389us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.08%      22.730us        91.24%       1.923ms     640.840us       5.344us       100.00%       7.168us       2.389us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.344us       100.00%       5.344us       1.781us             3  
+                                Activity Buffer Request        82.66%       1.742ms        82.66%       1.742ms       1.742ms       1.824us        34.13%       1.824us       1.824us             1  
+                                       aten::empty_like         0.40%       8.480us         1.40%      29.552us       9.851us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.00%      21.072us         1.00%      21.072us       7.024us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.51%     158.242us         7.51%     158.242us      52.747us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.220us         0.25%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 637.557us
-Self CUDA time total: 5.376us
+Self CPU time total: 2.107ms
+Self CUDA time total: 5.344us
 
 
 
@@ -4584,19 +4366,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.676us      2174.35%     119.676us     119.676us             1  
-                               hf_kernels_causal_conv1d        14.25%      74.352us        99.01%     516.513us     516.513us       0.000us         0.00%       7.392us       7.392us             1  
-                                         CausalConv1dFn        14.02%      73.122us        84.76%     442.161us     147.387us       0.000us         0.00%       7.392us       2.464us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.04%      26.281us        65.18%     340.038us     113.346us       5.504us       100.00%       7.392us       2.464us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.504us       100.00%       5.504us       1.835us             3  
-                                Activity Buffer Request        30.19%     157.524us        30.19%     157.524us     157.524us       1.888us        34.30%       1.888us       1.888us             1  
-                                       aten::empty_like         1.50%       7.800us         5.56%      29.001us       9.667us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.06%      21.201us         4.06%      21.201us       7.067us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        29.95%     156.233us        29.95%     156.233us      52.078us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.99%       5.180us         0.99%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     114.399us      2127.96%     114.399us     114.399us             1  
+                               hf_kernels_causal_conv1d        16.62%      75.320us        98.88%     448.097us     448.097us       0.000us         0.00%       7.200us       7.200us             1  
+                                         CausalConv1dFn        15.04%      68.172us        82.26%     372.777us     124.259us       0.000us         0.00%       7.200us       2.400us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.05%      22.881us        60.95%     276.214us      92.071us       5.376us       100.00%       7.200us       2.400us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
+                                Activity Buffer Request        20.71%      93.851us        20.71%      93.851us      93.851us       1.824us        33.93%       1.824us       1.824us             1  
+                                       aten::empty_like         1.68%       7.630us         6.27%      28.391us       9.464us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.58%      20.761us         4.58%      20.761us       6.920us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.19%     159.482us        35.19%     159.482us      53.161us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.12%       5.070us         1.12%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 521.693us
-Self CUDA time total: 5.504us
+Self CPU time total: 453.167us
+Self CUDA time total: 5.376us
 
 
 
@@ -4606,19 +4388,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.798us       715.63%     124.798us     124.798us             1  
-                               hf_kernels_causal_conv1d        11.85%      75.293us        99.15%     630.167us     630.167us       0.000us         0.00%      23.295us      23.295us             1  
-                                         CausalConv1dFn        11.06%      70.310us        87.30%     554.874us     184.958us       0.000us         0.00%      23.295us       7.765us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.18%      26.540us        71.39%     453.732us     151.244us      17.439us       100.00%      23.295us       7.765us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.439us       100.00%      17.439us       5.813us             3  
-                                Activity Buffer Request        42.20%     268.237us        42.20%     268.237us     268.237us       5.856us        33.58%       5.856us       5.856us             1  
-                                       aten::empty_like         1.25%       7.951us         4.85%      30.832us      10.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.60%      22.881us         3.60%      22.881us       7.627us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        25.01%     158.955us        25.01%     158.955us      52.985us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.85%       5.410us         0.85%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.887us       696.30%     121.887us     121.887us             1  
+                               hf_kernels_causal_conv1d         3.44%      74.640us        99.77%       2.162ms       2.162ms       0.000us         0.00%      23.361us      23.361us             1  
+                                         CausalConv1dFn         3.19%      69.031us        96.32%       2.087ms     695.668us       0.000us         0.00%      23.361us       7.787us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.10%      23.730us        91.78%       1.989ms     662.904us      17.505us       100.00%      23.361us       7.787us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.505us       100.00%      17.505us       5.835us             3  
+                                Activity Buffer Request        82.75%       1.793ms        82.75%       1.793ms       1.793ms       5.856us        33.45%       5.856us       5.856us             1  
+                                       aten::empty_like         0.40%       8.582us         1.35%      29.262us       9.754us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.95%      20.680us         0.95%      20.680us       6.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.94%     172.113us         7.94%     172.113us      57.371us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       5.069us         0.23%       5.069us       5.069us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 635.577us
-Self CUDA time total: 17.439us
+Self CPU time total: 2.167ms
+Self CUDA time total: 17.505us
 
 
 
@@ -4628,19 +4410,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.252us       695.89%     124.252us     124.252us             1  
-                               hf_kernels_causal_conv1d        15.28%      76.213us        99.04%     494.053us     494.053us       0.000us         0.00%      23.839us      23.839us             1  
-                                         CausalConv1dFn        14.60%      72.841us        83.76%     417.840us     139.280us       0.000us         0.00%      23.839us       7.946us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.38%      26.851us        63.27%     315.607us     105.202us      17.855us       100.00%      23.839us       7.946us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.855us       100.00%      17.855us       5.952us             3  
-                                Activity Buffer Request        26.40%     131.703us        26.40%     131.703us     131.703us       5.984us        33.51%       5.984us       5.984us             1  
-                                       aten::empty_like         1.62%       8.090us         5.89%      29.392us       9.797us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.27%      21.302us         4.27%      21.302us       7.101us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.48%     157.053us        31.48%     157.053us      52.351us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.96%       4.810us         0.96%       4.810us       4.810us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.997us       664.91%     119.997us     119.997us             1  
+                               hf_kernels_causal_conv1d        16.46%      76.510us        98.91%     459.857us     459.857us       0.000us         0.00%      24.063us      24.063us             1  
+                                         CausalConv1dFn        14.99%      69.691us        82.45%     383.347us     127.782us       0.000us         0.00%      24.063us       8.021us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.12%      23.810us        61.53%     286.094us      95.365us      18.047us       100.00%      24.063us       8.021us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
+                                Activity Buffer Request        22.64%     105.271us        22.64%     105.271us     105.271us       6.016us        33.34%       6.016us       6.016us             1  
+                                       aten::empty_like         1.59%       7.411us         5.93%      27.562us       9.187us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.33%      20.151us         4.33%      20.151us       6.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.77%     157.013us        33.77%     157.013us      52.338us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.09%       5.080us         1.09%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 498.863us
-Self CUDA time total: 17.855us
+Self CPU time total: 464.937us
+Self CUDA time total: 18.047us
 
 
 
@@ -4650,19 +4432,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.253us       695.94%     124.253us     124.253us             1  
-                               hf_kernels_causal_conv1d        14.09%      92.581us        99.22%     652.096us     652.096us       0.000us         0.00%      23.838us      23.838us             1  
-                                         CausalConv1dFn        11.45%      75.254us        85.13%     559.515us     186.505us       0.000us         0.00%      23.838us       7.946us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.84%      25.251us        69.30%     455.481us     151.827us      17.854us       100.00%      23.838us       7.946us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.854us       100.00%      17.854us       5.951us             3  
-                                Activity Buffer Request        41.42%     272.247us        41.42%     272.247us     272.247us       5.984us        33.52%       5.984us       5.984us             1  
-                                       aten::empty_like         1.19%       7.849us         4.38%      28.780us       9.593us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.18%      20.931us         3.18%      20.931us       6.977us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.04%     157.983us        24.04%     157.983us      52.661us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.78%       5.140us         0.78%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.983us       701.78%     125.983us     125.983us             1  
+                               hf_kernels_causal_conv1d         3.62%      75.400us        99.76%       2.076ms       2.076ms       0.000us         0.00%      23.968us      23.968us             1  
+                                         CausalConv1dFn         3.51%      72.963us        96.14%       2.001ms     667.008us       0.000us         0.00%      23.968us       7.989us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.17%      24.320us        91.19%       1.898ms     632.703us      17.952us       100.00%      23.968us       7.989us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.952us       100.00%      17.952us       5.984us             3  
+                                Activity Buffer Request        82.20%       1.711ms        82.20%       1.711ms       1.711ms       6.016us        33.51%       6.016us       6.016us             1  
+                                       aten::empty_like         0.41%       8.499us         1.44%      29.950us       9.983us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.03%      21.451us         1.03%      21.451us       7.150us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.83%     162.893us         7.83%     162.893us      54.298us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       4.969us         0.24%       4.969us       4.969us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 657.236us
-Self CUDA time total: 17.854us
+Self CPU time total: 2.081ms
+Self CUDA time total: 17.952us
 
 
 
@@ -4672,19 +4454,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.982us       651.61%     121.982us     121.982us             1  
-                               hf_kernels_causal_conv1d        16.26%      76.273us        99.00%     464.343us     464.343us       0.000us         0.00%      25.088us      25.088us             1  
-                                         CausalConv1dFn        15.20%      71.302us        82.74%     388.070us     129.357us       0.000us         0.00%      25.088us       8.363us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.49%      25.750us        61.15%     286.808us      95.603us      18.720us       100.00%      25.088us       8.363us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.720us       100.00%      18.720us       6.240us             3  
-                                Activity Buffer Request        22.13%     103.813us        22.13%     103.813us     103.813us       6.368us        34.02%       6.368us       6.368us             1  
-                                       aten::empty_like         1.75%       8.210us         6.39%      29.960us       9.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.64%      21.750us         4.64%      21.750us       7.250us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.53%     157.245us        33.53%     157.245us      52.415us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.00%       4.680us         1.00%       4.680us       4.680us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.901us       639.40%     119.901us     119.901us             1  
+                               hf_kernels_causal_conv1d        11.47%      73.600us        99.21%     636.820us     636.820us       0.000us         0.00%      25.088us      25.088us             1  
+                                         CausalConv1dFn        11.28%      72.380us        87.74%     563.220us     187.740us       0.000us         0.00%      25.088us       8.363us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.65%      23.431us        72.11%     462.887us     154.296us      18.752us       100.00%      25.088us       8.363us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.752us       100.00%      18.752us       6.251us             3  
+                                Activity Buffer Request        43.62%     280.014us        43.62%     280.014us     280.014us       6.336us        33.79%       6.336us       6.336us             1  
+                                       aten::empty_like         1.22%       7.832us         4.35%      27.953us       9.318us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.13%      20.121us         3.13%      20.121us       6.707us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.84%     159.442us        24.84%     159.442us      53.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.79%       5.080us         0.79%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 469.023us
-Self CUDA time total: 18.720us
+Self CPU time total: 641.900us
+Self CUDA time total: 18.752us
 
 
 
@@ -4694,19 +4476,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         4.40%      80.973us        99.73%       1.837ms       1.837ms       0.000us         0.00%     162.749us     162.749us             1  
-                                         CausalConv1dFn         4.14%      76.301us        95.33%       1.756ms     585.285us       0.000us         0.00%     162.749us      54.250us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.45%      26.730us        89.50%       1.648ms     549.474us      97.918us       100.00%     162.749us      54.250us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     141.950us       144.97%     141.950us     141.950us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.918us       100.00%      97.918us      32.639us             3  
-                                Activity Buffer Request        78.99%       1.455ms        78.99%       1.455ms       1.455ms      64.831us        66.21%      64.831us      64.831us             1  
-                                       aten::empty_like         0.45%       8.340us         1.69%      31.131us      10.377us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.24%      22.791us         1.24%      22.791us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.06%     166.885us         9.06%     166.885us      55.628us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.980us         0.27%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        11.42%      73.310us        99.16%     636.780us     636.780us       0.000us         0.00%     162.591us     162.591us             1  
+                                         CausalConv1dFn        11.12%      71.382us        87.74%     563.470us     187.823us       0.000us         0.00%     162.591us      54.197us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.58%      22.989us        72.14%     463.287us     154.429us      97.631us       100.00%     162.591us      54.197us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.208us       133.37%     130.208us     130.208us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.631us       100.00%      97.631us      32.544us             3  
+                                Activity Buffer Request        43.38%     278.604us        43.38%     278.604us     278.604us      64.960us        66.54%      64.960us      64.960us             1  
+                                       aten::empty_like         1.24%       7.950us         4.48%      28.801us       9.600us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.25%      20.851us         3.25%      20.851us       6.950us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.18%     161.694us        25.18%     161.694us      53.898us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.84%       5.420us         0.84%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.842ms
-Self CUDA time total: 97.918us
+Self CPU time total: 642.200us
+Self CUDA time total: 97.631us
 
 
 
@@ -4716,19 +4498,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        16.07%      76.871us        98.94%     473.172us     473.172us       0.000us         0.00%     163.803us     163.803us             1  
-                                         CausalConv1dFn        14.96%      71.532us        82.87%     396.301us     132.100us       0.000us         0.00%     163.803us      54.601us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.75%      27.501us        61.56%     294.418us      98.139us      98.685us       100.00%     163.803us      54.601us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.180us       134.95%     133.180us     133.180us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.685us       100.00%      98.685us      32.895us             3  
-                                Activity Buffer Request        21.65%     103.543us        21.65%     103.543us     103.543us      65.118us        65.99%      65.118us      65.118us             1  
-                                       aten::empty_like         1.52%       7.251us         6.35%      30.351us      10.117us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.83%      23.100us         4.83%      23.100us       7.700us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.16%     163.374us        34.16%     163.374us      54.458us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.06%       5.061us         1.06%       5.061us       5.061us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        13.89%      72.060us        98.98%     513.378us     513.378us       0.000us         0.00%     163.263us     163.263us             1  
+                                         CausalConv1dFn        13.96%      72.421us        85.08%     441.318us     147.106us       0.000us         0.00%     163.263us      54.421us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.45%      23.099us        65.49%     339.676us     113.225us      98.623us       100.00%     163.263us      54.421us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.111us       131.93%     130.111us     130.111us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.623us       100.00%      98.623us      32.874us             3  
+                                Activity Buffer Request        30.19%     156.612us        30.19%     156.612us     156.612us      64.640us        65.54%      64.640us      64.640us             1  
+                                       aten::empty_like         1.62%       8.391us         5.63%      29.221us       9.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.02%      20.830us         4.02%      20.830us       6.943us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.84%     159.965us        30.84%     159.965us      53.322us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.02%       5.310us         1.02%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 478.233us
-Self CUDA time total: 98.685us
+Self CPU time total: 518.688us
+Self CUDA time total: 98.623us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4738,7 +4520,7 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.04  True
 hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
@@ -4760,13 +4542,14 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 
▶ UV Install Logs
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 9%|�� | 1/11 [00:00<00:01, 9.42it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:00, 4.98it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 7.98it/s]
+Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:01, 3.51it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.51it/s]

Artifacts:

causal_conv1d.jsonl