diff --git "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" --- "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" +++ "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - ; -} - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - - { - % endif % -} - - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Fri Oct 31 20:00:25 2025 +Mon Nov 10 21:57:49 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 33C P0 79W / 350W | 0MiB / 46068MiB | 11% Default | +| N/A 27C P0 77W / 350W | 0MiB / 46068MiB | 18% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -4156,7 +3938,7 @@ Cell: nv | 0.21s ▼ output ▶ uv-logs | -Cell: benchmark | 9.11s +Cell: benchmark | 10.37s | Raw @@ -4210,19 +3992,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 180.703us 4446.43% 180.703us 180.703us 1 - hf_kernels_causal_conv1d 8.48% 160.534us 99.62% 1.886ms 1.886ms 0.000us 0.00% 5.504us 5.504us 1 - CausalConv1dFn 6.47% 122.423us 91.15% 1.726ms 575.261us 0.000us 0.00% 5.504us 1.835us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.51% 28.612us 80.84% 1.531ms 510.207us 4.064us 100.00% 5.504us 1.835us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3 - Activity Buffer Request 76.71% 1.452ms 76.71% 1.452ms 1.452ms 1.440us 35.43% 1.440us 1.440us 1 - aten::empty_like 1.07% 20.220us 3.84% 72.741us 24.247us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 2.77% 52.521us 2.77% 52.521us 17.507us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.62% 49.571us 2.62% 49.571us 16.524us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.38% 7.101us 0.38% 7.101us 7.101us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 156.321us 3758.62% 156.321us 156.321us 1 + hf_kernels_causal_conv1d 6.87% 159.072us 99.36% 2.300ms 2.300ms 0.000us 0.00% 5.599us 5.599us 1 + CausalConv1dFn 4.82% 111.622us 92.49% 2.141ms 713.785us 0.000us 0.00% 5.599us 1.866us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.19% 27.462us 84.76% 1.962ms 654.127us 4.159us 100.00% 5.599us 1.866us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.159us 100.00% 4.159us 1.386us 3 + Activity Buffer Request 81.39% 1.884ms 81.39% 1.884ms 1.884ms 1.440us 34.62% 1.440us 1.440us 1 + aten::empty_like 0.94% 21.650us 2.91% 67.351us 22.450us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.97% 45.701us 1.97% 45.701us 15.234us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.18% 50.500us 2.18% 50.500us 16.833us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.64% 14.811us 0.64% 14.811us 14.811us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.893ms -Self CUDA time total: 4.064us +Self CPU time total: 2.315ms +Self CUDA time total: 4.159us @@ -4232,19 +4014,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.791us 3331.33% 125.791us 125.791us 1 - hf_kernels_causal_conv1d 5.58% 96.392us 99.64% 1.721ms 1.721ms 0.000us 0.00% 5.056us 5.056us 1 - CausalConv1dFn 4.40% 76.074us 94.06% 1.625ms 541.671us 0.000us 0.00% 5.056us 1.685us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.52% 26.231us 87.95% 1.519ms 506.473us 3.776us 100.00% 5.056us 1.685us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.776us 100.00% 3.776us 1.259us 3 - Activity Buffer Request 84.56% 1.461ms 84.56% 1.461ms 1.461ms 1.280us 33.90% 1.280us 1.280us 1 - aten::empty_like 0.44% 7.590us 1.71% 29.520us 9.840us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.27% 21.930us 1.27% 21.930us 7.310us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.87% 32.290us 1.87% 32.290us 10.763us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.36% 6.200us 0.36% 6.200us 6.200us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.455us 3297.41% 123.455us 123.455us 1 + hf_kernels_causal_conv1d 4.13% 83.101us 99.73% 2.009ms 2.009ms 0.000us 0.00% 4.992us 4.992us 1 + CausalConv1dFn 3.66% 73.760us 95.61% 1.926ms 641.917us 0.000us 0.00% 4.992us 1.664us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.15% 23.071us 90.47% 1.822ms 607.420us 3.744us 100.00% 4.992us 1.664us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.744us 100.00% 3.744us 1.248us 3 + Activity Buffer Request 87.83% 1.769ms 87.83% 1.769ms 1.769ms 1.248us 33.33% 1.248us 1.248us 1 + aten::empty_like 0.39% 7.860us 1.48% 29.730us 9.910us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.09% 21.870us 1.09% 21.870us 7.290us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.49% 30.082us 1.49% 30.082us 10.027us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 5.421us 0.27% 5.421us 5.421us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.728ms -Self CUDA time total: 3.776us +Self CPU time total: 2.014ms +Self CUDA time total: 3.744us @@ -4254,19 +4036,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.758us 3330.46% 125.758us 125.758us 1 - hf_kernels_causal_conv1d 5.23% 90.742us 99.66% 1.729ms 1.729ms 0.000us 0.00% 5.056us 5.056us 1 - CausalConv1dFn 4.39% 76.092us 94.43% 1.638ms 546.081us 0.000us 0.00% 5.056us 1.685us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.50% 26.031us 88.31% 1.532ms 510.660us 3.776us 100.00% 5.056us 1.685us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.776us 100.00% 3.776us 1.259us 3 - Activity Buffer Request 84.98% 1.474ms 84.98% 1.474ms 1.474ms 1.280us 33.90% 1.280us 1.280us 1 - aten::empty_like 0.47% 8.201us 1.74% 30.171us 10.057us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.27% 21.970us 1.27% 21.970us 7.323us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.83% 31.671us 1.83% 31.671us 10.557us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.34% 5.850us 0.34% 5.850us 5.850us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.263us 3185.44% 119.263us 119.263us 1 + hf_kernels_causal_conv1d 3.91% 78.640us 99.72% 2.003ms 2.003ms 0.000us 0.00% 4.992us 4.992us 1 + CausalConv1dFn 3.57% 71.661us 95.80% 1.925ms 641.537us 0.000us 0.00% 4.992us 1.664us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.13% 22.781us 90.75% 1.823ms 607.693us 3.744us 100.00% 4.992us 1.664us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.744us 100.00% 3.744us 1.248us 3 + Activity Buffer Request 88.14% 1.771ms 88.14% 1.771ms 1.771ms 1.248us 33.33% 1.248us 1.248us 1 + aten::empty_like 0.41% 8.160us 1.49% 29.872us 9.957us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.08% 21.712us 1.08% 21.712us 7.237us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.48% 29.670us 1.48% 29.670us 9.890us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.28% 5.669us 0.28% 5.669us 5.669us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.735ms -Self CUDA time total: 3.776us +Self CPU time total: 2.009ms +Self CUDA time total: 3.744us @@ -4276,19 +4058,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 127.584us 3350.42% 127.584us 127.584us 1 - hf_kernels_causal_conv1d 4.53% 88.983us 99.75% 1.962ms 1.962ms 0.000us 0.00% 5.088us 5.088us 1 - CausalConv1dFn 3.93% 77.252us 95.23% 1.873ms 624.219us 0.000us 0.00% 5.088us 1.696us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.36% 26.710us 89.83% 1.766ms 588.805us 3.808us 100.00% 5.088us 1.696us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.808us 100.00% 3.808us 1.269us 3 - Activity Buffer Request 74.34% 1.462ms 74.34% 1.462ms 1.462ms 1.280us 33.61% 1.280us 1.280us 1 - aten::empty_like 0.41% 8.060us 1.47% 28.990us 9.663us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.06% 20.930us 1.06% 20.930us 6.977us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 14.13% 277.777us 14.13% 277.777us 92.592us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.25% 4.831us 0.25% 4.831us 4.831us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.790us 3253.81% 121.790us 121.790us 1 + hf_kernels_causal_conv1d 3.48% 76.970us 99.77% 2.208ms 2.208ms 0.000us 0.00% 4.991us 4.991us 1 + CausalConv1dFn 3.33% 73.753us 96.30% 2.131ms 710.368us 0.000us 0.00% 4.991us 1.664us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.03% 22.770us 91.66% 2.029ms 676.184us 3.743us 100.00% 4.991us 1.664us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.743us 100.00% 3.743us 1.248us 3 + Activity Buffer Request 81.47% 1.803ms 81.47% 1.803ms 1.803ms 1.248us 33.34% 1.248us 1.248us 1 + aten::empty_like 0.36% 7.858us 1.30% 28.800us 9.600us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.95% 20.942us 0.95% 20.942us 6.981us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 9.17% 202.863us 9.17% 202.863us 67.621us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.23% 4.991us 0.23% 4.991us 4.991us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.966ms -Self CUDA time total: 3.808us +Self CPU time total: 2.213ms +Self CUDA time total: 3.743us @@ -4298,19 +4080,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 126.686us 2639.84% 126.686us 126.686us 1 - hf_kernels_causal_conv1d 4.55% 87.622us 99.73% 1.920ms 1.920ms 0.000us 0.00% 6.430us 6.430us 1 - CausalConv1dFn 3.92% 75.482us 95.18% 1.832ms 610.789us 0.000us 0.00% 6.430us 2.143us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.44% 27.663us 89.66% 1.726ms 575.372us 4.799us 100.00% 6.430us 2.143us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.799us 100.00% 4.799us 1.600us 3 - Activity Buffer Request 74.49% 1.434ms 74.49% 1.434ms 1.434ms 1.631us 33.99% 1.631us 1.631us 1 - aten::empty_like 0.42% 8.140us 1.60% 30.770us 10.257us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.18% 22.630us 1.18% 22.630us 7.543us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 13.74% 264.526us 13.74% 264.526us 88.175us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.27% 5.120us 0.27% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.073us 2547.57% 123.073us 123.073us 1 + hf_kernels_causal_conv1d 3.82% 79.680us 99.75% 2.083ms 2.083ms 0.000us 0.00% 6.463us 6.463us 1 + CausalConv1dFn 3.53% 73.692us 95.93% 2.003ms 667.744us 0.000us 0.00% 6.463us 2.154us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.17% 24.371us 90.98% 1.900ms 633.257us 4.831us 100.00% 6.463us 2.154us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.831us 100.00% 4.831us 1.610us 3 + Activity Buffer Request 81.73% 1.707ms 81.73% 1.707ms 1.707ms 1.632us 33.78% 1.632us 1.632us 1 + aten::empty_like 0.42% 8.791us 1.43% 29.771us 9.924us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.00% 20.980us 1.00% 20.980us 6.993us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.08% 168.682us 8.08% 168.682us 56.227us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.25% 5.250us 0.25% 5.250us 5.250us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.925ms -Self CUDA time total: 4.799us +Self CPU time total: 2.088ms +Self CUDA time total: 4.831us @@ -4320,19 +4102,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.083us 2423.58% 117.083us 117.083us 1 - hf_kernels_causal_conv1d 12.24% 83.203us 99.28% 674.957us 674.957us 0.000us 0.00% 6.463us 6.463us 1 - CausalConv1dFn 10.43% 70.911us 87.04% 591.754us 197.251us 0.000us 0.00% 6.463us 2.154us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 3.93% 26.710us 72.18% 490.682us 163.561us 4.831us 100.00% 6.463us 2.154us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.831us 100.00% 4.831us 1.610us 3 - Activity Buffer Request 32.42% 220.416us 32.42% 220.416us 220.416us 1.632us 33.78% 1.632us 1.632us 1 - aten::empty_like 1.07% 7.270us 4.44% 30.161us 10.054us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.37% 22.891us 3.37% 22.891us 7.630us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 35.83% 243.556us 35.83% 243.556us 81.185us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.72% 4.870us 0.72% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 113.883us 2373.55% 113.883us 113.883us 1 + hf_kernels_causal_conv1d 15.03% 75.250us 99.01% 495.717us 495.717us 0.000us 0.00% 6.430us 6.430us 1 + CausalConv1dFn 13.70% 68.601us 83.98% 420.467us 140.156us 0.000us 0.00% 6.430us 2.143us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.03% 25.190us 64.69% 323.874us 107.958us 4.798us 100.00% 6.430us 2.143us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.798us 100.00% 4.798us 1.599us 3 + Activity Buffer Request 28.01% 140.222us 28.01% 140.222us 140.222us 1.632us 34.01% 1.632us 1.632us 1 + aten::empty_like 1.45% 7.260us 5.59% 27.992us 9.331us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.14% 20.732us 4.14% 20.732us 6.911us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 31.65% 158.462us 31.65% 158.462us 52.821us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.99% 4.940us 0.99% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 679.827us -Self CUDA time total: 4.831us +Self CPU time total: 500.657us +Self CUDA time total: 4.798us @@ -4342,19 +4124,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.381us 1167.35% 124.381us 124.381us 1 - hf_kernels_causal_conv1d 4.48% 85.542us 99.75% 1.904ms 1.904ms 0.000us 0.00% 14.271us 14.271us 1 - CausalConv1dFn 3.83% 73.182us 95.27% 1.819ms 606.282us 0.000us 0.00% 14.271us 4.757us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 26.960us 89.88% 1.716ms 571.988us 10.655us 100.00% 14.271us 4.757us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.655us 100.00% 10.655us 3.552us 3 - Activity Buffer Request 76.01% 1.451ms 76.01% 1.451ms 1.451ms 3.616us 33.94% 3.616us 3.616us 1 - aten::empty_like 0.43% 8.120us 1.56% 29.700us 9.900us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.13% 21.580us 1.13% 21.580us 7.193us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 12.45% 237.787us 12.45% 237.787us 79.262us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.25% 4.860us 0.25% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.365us 1148.32% 122.365us 122.365us 1 + hf_kernels_causal_conv1d 3.51% 76.530us 99.77% 2.176ms 2.176ms 0.000us 0.00% 14.208us 14.208us 1 + CausalConv1dFn 3.29% 71.713us 96.26% 2.099ms 699.771us 0.000us 0.00% 14.208us 4.736us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.11% 24.170us 91.65% 1.999ms 666.274us 10.656us 100.00% 14.208us 4.736us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.656us 100.00% 10.656us 3.552us 3 + Activity Buffer Request 82.90% 1.808ms 82.90% 1.808ms 1.808ms 3.552us 33.33% 3.552us 3.552us 1 + aten::empty_like 0.37% 8.070us 1.32% 28.780us 9.593us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.95% 20.710us 0.95% 20.710us 6.903us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.64% 166.713us 7.64% 166.713us 55.571us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.23% 5.051us 0.23% 5.051us 5.051us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.909ms -Self CUDA time total: 10.655us +Self CPU time total: 2.181ms +Self CUDA time total: 10.656us @@ -4364,19 +4146,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.652us 1120.72% 122.652us 122.652us 1 - hf_kernels_causal_conv1d 12.91% 86.303us 99.27% 663.588us 663.588us 0.000us 0.00% 14.624us 14.624us 1 - CausalConv1dFn 10.74% 71.821us 86.36% 577.285us 192.428us 0.000us 0.00% 14.624us 4.875us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 3.81% 25.480us 71.21% 476.023us 158.674us 10.944us 100.00% 14.624us 4.875us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 100.00% 10.944us 3.648us 3 - Activity Buffer Request 32.82% 219.426us 32.82% 219.426us 219.426us 3.680us 33.63% 3.680us 3.680us 1 - aten::empty_like 1.14% 7.591us 4.40% 29.441us 9.814us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.27% 21.850us 3.27% 21.850us 7.283us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 34.57% 231.117us 34.57% 231.117us 77.039us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.73% 4.900us 0.73% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 115.676us 1057.08% 115.676us 115.676us 1 + hf_kernels_causal_conv1d 15.90% 75.141us 98.97% 467.777us 467.777us 0.000us 0.00% 14.654us 14.654us 1 + CausalConv1dFn 14.89% 70.359us 83.07% 392.636us 130.879us 0.000us 0.00% 14.654us 4.885us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.95% 23.391us 62.24% 294.186us 98.062us 10.943us 100.00% 14.654us 4.885us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.943us 100.00% 10.943us 3.648us 3 + Activity Buffer Request 23.54% 111.281us 23.54% 111.281us 111.281us 3.711us 33.91% 3.711us 3.711us 1 + aten::empty_like 1.66% 7.830us 5.94% 28.091us 9.364us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.29% 20.261us 4.29% 20.261us 6.754us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 33.75% 159.514us 33.75% 159.514us 53.171us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.03% 4.890us 1.03% 4.890us 4.890us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 668.488us -Self CUDA time total: 10.944us +Self CPU time total: 472.667us +Self CUDA time total: 10.943us @@ -4386,19 +4168,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.430us 1181.43% 130.430us 130.430us 1 - hf_kernels_causal_conv1d 4.23% 79.341us 99.73% 1.871ms 1.871ms 0.000us 0.00% 14.784us 14.784us 1 - CausalConv1dFn 4.03% 75.521us 95.50% 1.792ms 597.206us 0.000us 0.00% 14.784us 4.928us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.43% 26.810us 89.82% 1.685ms 561.675us 11.040us 100.00% 14.784us 4.928us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.040us 100.00% 11.040us 3.680us 3 - Activity Buffer Request 77.07% 1.446ms 77.07% 1.446ms 1.446ms 3.744us 33.91% 3.744us 3.744us 1 - aten::empty_like 0.44% 8.272us 1.66% 31.072us 10.357us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.22% 22.800us 1.22% 22.800us 7.600us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 11.32% 212.286us 11.32% 212.286us 70.762us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.27% 5.130us 0.27% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.422us 1124.47% 123.422us 123.422us 1 + hf_kernels_causal_conv1d 3.69% 77.100us 99.75% 2.084ms 2.084ms 0.000us 0.00% 14.656us 14.656us 1 + CausalConv1dFn 3.52% 73.471us 96.06% 2.007ms 668.988us 0.000us 0.00% 14.656us 4.885us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.13% 23.660us 90.70% 1.895ms 631.647us 10.976us 100.00% 14.656us 4.885us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.976us 100.00% 10.976us 3.659us 3 + Activity Buffer Request 81.81% 1.709ms 81.81% 1.709ms 1.709ms 3.680us 33.53% 3.680us 3.680us 1 + aten::empty_like 0.81% 17.020us 1.85% 38.551us 12.850us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.03% 21.531us 1.03% 21.531us 7.177us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.76% 162.104us 7.76% 162.104us 54.035us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.25% 5.260us 0.25% 5.260us 5.260us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.876ms -Self CUDA time total: 11.040us +Self CPU time total: 2.089ms +Self CUDA time total: 10.976us @@ -4408,19 +4190,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.097us 1060.18% 120.097us 120.097us 1 - hf_kernels_causal_conv1d 13.35% 76.301us 99.17% 566.674us 566.674us 0.000us 0.00% 15.168us 15.168us 1 - CausalConv1dFn 12.80% 73.153us 85.81% 490.373us 163.458us 0.000us 0.00% 15.168us 5.056us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 4.71% 26.911us 68.00% 388.569us 129.523us 11.328us 100.00% 15.168us 5.056us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.328us 100.00% 11.328us 3.776us 3 - Activity Buffer Request 34.49% 197.075us 34.49% 197.075us 197.075us 3.840us 33.90% 3.840us 3.840us 1 - aten::empty_like 1.29% 7.379us 5.01% 28.651us 9.550us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.72% 21.272us 3.72% 21.272us 7.091us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 28.80% 164.583us 28.80% 164.583us 54.861us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.83% 4.760us 0.83% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.952us 1044.29% 117.952us 117.952us 1 + hf_kernels_causal_conv1d 16.01% 73.960us 98.90% 456.837us 456.837us 0.000us 0.00% 15.071us 15.071us 1 + CausalConv1dFn 15.53% 71.741us 82.89% 382.877us 127.626us 0.000us 0.00% 15.071us 5.024us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.93% 22.791us 61.20% 282.685us 94.228us 11.295us 100.00% 15.071us 5.024us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.295us 100.00% 11.295us 3.765us 3 + Activity Buffer Request 21.70% 100.232us 21.70% 100.232us 100.232us 3.776us 33.43% 3.776us 3.776us 1 + aten::empty_like 1.73% 7.970us 6.16% 28.451us 9.484us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.43% 20.481us 4.43% 20.481us 6.827us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.57% 159.662us 34.57% 159.662us 53.221us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.10% 5.060us 1.10% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 571.434us -Self CUDA time total: 11.328us +Self CPU time total: 461.897us +Self CUDA time total: 11.295us @@ -4430,19 +4212,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 133.919us 265.71% 133.919us 133.919us 1 - hf_kernels_causal_conv1d 4.38% 80.552us 99.73% 1.836ms 1.836ms 0.000us 0.00% 83.873us 83.873us 1 - CausalConv1dFn 4.09% 75.353us 95.35% 1.755ms 585.145us 0.000us 0.00% 83.873us 27.958us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.33% 24.410us 89.50% 1.648ms 549.264us 50.401us 100.00% 83.873us 27.958us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 50.401us 100.00% 50.401us 16.800us 3 - Activity Buffer Request 79.01% 1.455ms 79.01% 1.455ms 1.455ms 33.472us 66.41% 33.472us 33.472us 1 - aten::empty_like 0.45% 8.369us 1.75% 32.290us 10.763us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.30% 23.921us 1.30% 23.921us 7.974us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 9.17% 168.764us 9.17% 168.764us 56.255us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.27% 5.020us 0.27% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 128.158us 256.57% 128.158us 128.158us 1 + hf_kernels_causal_conv1d 3.51% 75.280us 99.75% 2.140ms 2.140ms 0.000us 0.00% 83.102us 83.102us 1 + CausalConv1dFn 3.36% 72.172us 96.24% 2.065ms 688.218us 0.000us 0.00% 83.102us 27.701us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.14% 24.540us 91.55% 1.964ms 654.657us 49.951us 100.00% 83.102us 27.701us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 49.951us 100.00% 49.951us 16.650us 3 + Activity Buffer Request 82.86% 1.778ms 82.86% 1.778ms 1.778ms 33.151us 66.37% 33.151us 33.151us 1 + aten::empty_like 0.37% 7.920us 1.33% 28.510us 9.503us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.96% 20.590us 0.96% 20.590us 6.863us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.54% 161.824us 7.54% 161.824us 53.941us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.25% 5.290us 0.25% 5.290us 5.290us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.841ms -Self CUDA time total: 50.401us +Self CPU time total: 2.145ms +Self CUDA time total: 49.951us @@ -4452,19 +4234,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 131.005us 256.03% 131.005us 131.005us 1 - hf_kernels_causal_conv1d 11.69% 77.241us 99.25% 655.717us 655.717us 0.000us 0.00% 85.534us 85.534us 1 - CausalConv1dFn 10.97% 72.503us 87.56% 578.476us 192.825us 0.000us 0.00% 85.534us 28.511us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 3.89% 25.692us 71.76% 474.103us 158.034us 51.167us 100.00% 85.534us 28.511us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 51.167us 100.00% 51.167us 17.056us 3 - Activity Buffer Request 43.08% 284.587us 43.08% 284.587us 284.587us 34.367us 67.17% 34.367us 34.367us 1 - aten::empty_like 1.14% 7.549us 4.82% 31.870us 10.623us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.68% 24.321us 3.68% 24.321us 8.107us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 24.80% 163.824us 24.80% 163.824us 54.608us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.75% 4.929us 0.75% 4.929us 4.929us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.310us 261.10% 121.310us 121.310us 1 + hf_kernels_causal_conv1d 16.42% 74.560us 98.88% 448.987us 448.987us 0.000us 0.00% 75.933us 75.933us 1 + CausalConv1dFn 15.28% 69.392us 82.46% 374.427us 124.809us 0.000us 0.00% 75.933us 25.311us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.01% 22.740us 60.80% 276.074us 92.025us 46.462us 100.00% 75.933us 25.311us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 46.462us 100.00% 46.462us 15.487us 3 + Activity Buffer Request 21.27% 96.581us 21.27% 96.581us 96.581us 29.471us 63.43% 29.471us 29.471us 1 + aten::empty_like 1.63% 7.411us 6.38% 28.961us 9.654us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.75% 21.550us 4.75% 21.550us 7.183us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.52% 156.753us 34.52% 156.753us 52.251us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.12% 5.090us 1.12% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 660.646us -Self CUDA time total: 51.167us +Self CPU time total: 454.077us +Self CUDA time total: 46.462us @@ -4474,19 +4256,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.686us 3040.89% 118.686us 118.686us 1 - hf_kernels_causal_conv1d 11.60% 73.750us 99.24% 631.216us 631.216us 0.000us 0.00% 5.183us 5.183us 1 - CausalConv1dFn 11.30% 71.845us 87.65% 557.466us 185.822us 0.000us 0.00% 5.183us 1.728us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 4.22% 26.861us 71.87% 457.101us 152.367us 3.903us 100.00% 5.183us 1.728us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.903us 100.00% 3.903us 1.301us 3 - Activity Buffer Request 42.38% 269.577us 42.38% 269.577us 269.577us 1.280us 32.80% 1.280us 1.280us 1 - aten::empty_like 1.23% 7.810us 4.48% 28.520us 9.507us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.26% 20.710us 3.26% 20.710us 6.903us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 25.26% 160.663us 25.26% 160.663us 53.554us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.76% 4.821us 0.76% 4.821us 4.821us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 128.254us 3312.35% 128.254us 128.254us 1 + hf_kernels_causal_conv1d 3.31% 74.540us 99.77% 2.245ms 2.245ms 0.000us 0.00% 5.120us 5.120us 1 + CausalConv1dFn 3.41% 76.802us 96.46% 2.170ms 723.418us 0.000us 0.00% 5.120us 1.707us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.08% 24.209us 91.78% 2.065ms 688.374us 3.872us 100.00% 5.120us 1.707us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.872us 100.00% 3.872us 1.291us 3 + Activity Buffer Request 83.69% 1.883ms 83.69% 1.883ms 1.883ms 1.248us 32.23% 1.248us 1.248us 1 + aten::empty_like 0.34% 7.679us 1.26% 28.331us 9.444us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.92% 20.652us 0.92% 20.652us 6.884us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.01% 157.803us 7.01% 157.803us 52.601us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.23% 5.180us 0.23% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 636.037us -Self CUDA time total: 3.903us +Self CPU time total: 2.250ms +Self CUDA time total: 3.872us @@ -4496,19 +4278,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.221us 3029.76% 120.221us 120.221us 1 - hf_kernels_causal_conv1d 13.01% 75.082us 99.09% 571.775us 571.775us 0.000us 0.00% 5.248us 5.248us 1 - CausalConv1dFn 12.35% 71.241us 86.08% 496.693us 165.564us 0.000us 0.00% 5.248us 1.749us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 4.88% 28.181us 68.58% 395.720us 131.907us 3.968us 100.00% 5.248us 1.749us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3 - Activity Buffer Request 36.26% 209.246us 36.26% 209.246us 209.246us 1.280us 32.26% 1.280us 1.280us 1 - aten::empty_like 1.42% 8.172us 5.15% 29.732us 9.911us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.74% 21.560us 3.74% 21.560us 7.187us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 27.43% 158.293us 27.43% 158.293us 52.764us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.91% 5.270us 0.91% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.470us 3059.11% 117.470us 117.470us 1 + hf_kernels_causal_conv1d 16.52% 75.490us 98.91% 451.907us 451.907us 0.000us 0.00% 5.056us 5.056us 1 + CausalConv1dFn 15.55% 71.061us 82.39% 376.417us 125.472us 0.000us 0.00% 5.056us 1.685us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.27% 24.090us 60.40% 275.984us 91.995us 3.840us 100.00% 5.056us 1.685us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.840us 100.00% 3.840us 1.280us 3 + Activity Buffer Request 20.75% 94.821us 20.75% 94.821us 94.821us 1.216us 31.67% 1.216us 1.216us 1 + aten::empty_like 1.80% 8.242us 6.43% 29.372us 9.791us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.62% 21.130us 4.62% 21.130us 7.043us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.38% 157.073us 34.38% 157.073us 52.358us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.09% 4.990us 1.09% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 577.045us -Self CUDA time total: 3.968us +Self CPU time total: 456.897us +Self CUDA time total: 3.840us @@ -4518,19 +4300,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.374us 2843.36% 117.374us 117.374us 1 - hf_kernels_causal_conv1d 14.38% 74.792us 98.97% 514.843us 514.843us 0.000us 0.00% 5.504us 5.504us 1 - CausalConv1dFn 13.25% 68.940us 84.59% 440.051us 146.684us 0.000us 0.00% 5.504us 1.835us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 4.99% 25.981us 65.51% 340.779us 113.593us 4.128us 100.00% 5.504us 1.835us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3 - Activity Buffer Request 29.84% 155.214us 29.84% 155.214us 155.214us 1.376us 33.33% 1.376us 1.376us 1 - aten::empty_like 1.55% 8.080us 5.83% 30.332us 10.111us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.28% 22.252us 4.28% 22.252us 7.417us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 30.68% 159.584us 30.68% 159.584us 53.195us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.03% 5.380us 1.03% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.191us 2958.18% 120.191us 120.191us 1 + hf_kernels_causal_conv1d 3.64% 78.360us 99.76% 2.149ms 2.149ms 0.000us 0.00% 5.406us 5.406us 1 + CausalConv1dFn 3.37% 72.531us 96.13% 2.071ms 690.275us 0.000us 0.00% 5.406us 1.802us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.05% 22.591us 91.41% 1.969ms 656.417us 4.063us 100.00% 5.406us 1.802us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3 + Activity Buffer Request 83.09% 1.790ms 83.09% 1.790ms 1.790ms 1.343us 33.05% 1.343us 1.343us 1 + aten::empty_like 0.37% 8.020us 1.35% 29.041us 9.680us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.98% 21.021us 0.98% 21.021us 7.007us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.27% 156.703us 7.27% 156.703us 52.234us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.24% 5.100us 0.24% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 520.223us -Self CUDA time total: 4.128us +Self CPU time total: 2.154ms +Self CUDA time total: 4.063us @@ -4540,19 +4322,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 116.831us 2875.49% 116.831us 116.831us 1 - hf_kernels_causal_conv1d 13.78% 75.282us 99.09% 541.484us 541.484us 0.000us 0.00% 5.439us 5.439us 1 - CausalConv1dFn 12.58% 68.741us 85.32% 466.202us 155.401us 0.000us 0.00% 5.439us 1.813us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 4.76% 26.021us 67.34% 367.980us 122.660us 4.063us 100.00% 5.439us 1.813us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3 - Activity Buffer Request 33.52% 183.175us 33.52% 183.175us 183.175us 1.376us 33.87% 1.376us 1.376us 1 - aten::empty_like 1.37% 7.489us 5.40% 29.481us 9.827us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.02% 21.992us 4.02% 21.992us 7.331us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 29.06% 158.784us 29.06% 158.784us 52.928us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.91% 4.951us 0.91% 4.951us 4.951us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.509us 2988.81% 120.509us 120.509us 1 + hf_kernels_causal_conv1d 16.24% 73.950us 98.87% 450.317us 450.317us 0.000us 0.00% 5.376us 5.376us 1 + CausalConv1dFn 17.23% 78.473us 82.64% 376.367us 125.456us 0.000us 0.00% 5.376us 1.792us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.08% 23.119us 59.28% 269.974us 89.991us 4.032us 100.00% 5.376us 1.792us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3 + Activity Buffer Request 19.95% 90.851us 19.95% 90.851us 90.851us 1.344us 33.33% 1.344us 1.344us 1 + aten::empty_like 1.73% 7.890us 6.13% 27.920us 9.307us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.40% 20.030us 4.40% 20.030us 6.677us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.25% 156.004us 34.25% 156.004us 52.001us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.13% 5.130us 1.13% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 546.435us -Self CUDA time total: 4.063us +Self CPU time total: 455.447us +Self CUDA time total: 4.032us @@ -4562,19 +4344,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.806us 2228.53% 119.806us 119.806us 1 - hf_kernels_causal_conv1d 11.93% 76.073us 99.21% 632.507us 632.507us 0.000us 0.00% 7.200us 7.200us 1 - CausalConv1dFn 11.21% 71.480us 87.28% 556.434us 185.478us 0.000us 0.00% 7.200us 2.400us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 4.13% 26.361us 71.46% 455.612us 151.871us 5.376us 100.00% 7.200us 2.400us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.376us 100.00% 5.376us 1.792us 3 - Activity Buffer Request 42.49% 270.867us 42.49% 270.867us 270.867us 1.824us 33.93% 1.824us 1.824us 1 - aten::empty_like 1.24% 7.892us 4.60% 29.342us 9.781us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.36% 21.450us 3.36% 21.450us 7.150us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 24.84% 158.384us 24.84% 158.384us 52.795us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.79% 5.050us 0.79% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.767us 2334.71% 124.767us 124.767us 1 + hf_kernels_causal_conv1d 3.64% 76.791us 99.75% 2.102ms 2.102ms 0.000us 0.00% 7.168us 7.168us 1 + CausalConv1dFn 3.46% 72.920us 96.11% 2.025ms 674.997us 0.000us 0.00% 7.168us 2.389us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.08% 22.730us 91.24% 1.923ms 640.840us 5.344us 100.00% 7.168us 2.389us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.344us 100.00% 5.344us 1.781us 3 + Activity Buffer Request 82.66% 1.742ms 82.66% 1.742ms 1.742ms 1.824us 34.13% 1.824us 1.824us 1 + aten::empty_like 0.40% 8.480us 1.40% 29.552us 9.851us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.00% 21.072us 1.00% 21.072us 7.024us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.51% 158.242us 7.51% 158.242us 52.747us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.25% 5.220us 0.25% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 637.557us -Self CUDA time total: 5.376us +Self CPU time total: 2.107ms +Self CUDA time total: 5.344us @@ -4584,19 +4366,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.676us 2174.35% 119.676us 119.676us 1 - hf_kernels_causal_conv1d 14.25% 74.352us 99.01% 516.513us 516.513us 0.000us 0.00% 7.392us 7.392us 1 - CausalConv1dFn 14.02% 73.122us 84.76% 442.161us 147.387us 0.000us 0.00% 7.392us 2.464us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.04% 26.281us 65.18% 340.038us 113.346us 5.504us 100.00% 7.392us 2.464us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.504us 100.00% 5.504us 1.835us 3 - Activity Buffer Request 30.19% 157.524us 30.19% 157.524us 157.524us 1.888us 34.30% 1.888us 1.888us 1 - aten::empty_like 1.50% 7.800us 5.56% 29.001us 9.667us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.06% 21.201us 4.06% 21.201us 7.067us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 29.95% 156.233us 29.95% 156.233us 52.078us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.99% 5.180us 0.99% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 114.399us 2127.96% 114.399us 114.399us 1 + hf_kernels_causal_conv1d 16.62% 75.320us 98.88% 448.097us 448.097us 0.000us 0.00% 7.200us 7.200us 1 + CausalConv1dFn 15.04% 68.172us 82.26% 372.777us 124.259us 0.000us 0.00% 7.200us 2.400us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.05% 22.881us 60.95% 276.214us 92.071us 5.376us 100.00% 7.200us 2.400us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.376us 100.00% 5.376us 1.792us 3 + Activity Buffer Request 20.71% 93.851us 20.71% 93.851us 93.851us 1.824us 33.93% 1.824us 1.824us 1 + aten::empty_like 1.68% 7.630us 6.27% 28.391us 9.464us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.58% 20.761us 4.58% 20.761us 6.920us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 35.19% 159.482us 35.19% 159.482us 53.161us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.12% 5.070us 1.12% 5.070us 5.070us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 521.693us -Self CUDA time total: 5.504us +Self CPU time total: 453.167us +Self CUDA time total: 5.376us @@ -4606,19 +4388,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.798us 715.63% 124.798us 124.798us 1 - hf_kernels_causal_conv1d 11.85% 75.293us 99.15% 630.167us 630.167us 0.000us 0.00% 23.295us 23.295us 1 - CausalConv1dFn 11.06% 70.310us 87.30% 554.874us 184.958us 0.000us 0.00% 23.295us 7.765us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 4.18% 26.540us 71.39% 453.732us 151.244us 17.439us 100.00% 23.295us 7.765us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.439us 100.00% 17.439us 5.813us 3 - Activity Buffer Request 42.20% 268.237us 42.20% 268.237us 268.237us 5.856us 33.58% 5.856us 5.856us 1 - aten::empty_like 1.25% 7.951us 4.85% 30.832us 10.277us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.60% 22.881us 3.60% 22.881us 7.627us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 25.01% 158.955us 25.01% 158.955us 52.985us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.85% 5.410us 0.85% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.887us 696.30% 121.887us 121.887us 1 + hf_kernels_causal_conv1d 3.44% 74.640us 99.77% 2.162ms 2.162ms 0.000us 0.00% 23.361us 23.361us 1 + CausalConv1dFn 3.19% 69.031us 96.32% 2.087ms 695.668us 0.000us 0.00% 23.361us 7.787us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.10% 23.730us 91.78% 1.989ms 662.904us 17.505us 100.00% 23.361us 7.787us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.505us 100.00% 17.505us 5.835us 3 + Activity Buffer Request 82.75% 1.793ms 82.75% 1.793ms 1.793ms 5.856us 33.45% 5.856us 5.856us 1 + aten::empty_like 0.40% 8.582us 1.35% 29.262us 9.754us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.95% 20.680us 0.95% 20.680us 6.893us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.94% 172.113us 7.94% 172.113us 57.371us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.23% 5.069us 0.23% 5.069us 5.069us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 635.577us -Self CUDA time total: 17.439us +Self CPU time total: 2.167ms +Self CUDA time total: 17.505us @@ -4628,19 +4410,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.252us 695.89% 124.252us 124.252us 1 - hf_kernels_causal_conv1d 15.28% 76.213us 99.04% 494.053us 494.053us 0.000us 0.00% 23.839us 23.839us 1 - CausalConv1dFn 14.60% 72.841us 83.76% 417.840us 139.280us 0.000us 0.00% 23.839us 7.946us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.38% 26.851us 63.27% 315.607us 105.202us 17.855us 100.00% 23.839us 7.946us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.855us 100.00% 17.855us 5.952us 3 - Activity Buffer Request 26.40% 131.703us 26.40% 131.703us 131.703us 5.984us 33.51% 5.984us 5.984us 1 - aten::empty_like 1.62% 8.090us 5.89% 29.392us 9.797us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.27% 21.302us 4.27% 21.302us 7.101us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 31.48% 157.053us 31.48% 157.053us 52.351us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.96% 4.810us 0.96% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.997us 664.91% 119.997us 119.997us 1 + hf_kernels_causal_conv1d 16.46% 76.510us 98.91% 459.857us 459.857us 0.000us 0.00% 24.063us 24.063us 1 + CausalConv1dFn 14.99% 69.691us 82.45% 383.347us 127.782us 0.000us 0.00% 24.063us 8.021us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.12% 23.810us 61.53% 286.094us 95.365us 18.047us 100.00% 24.063us 8.021us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.047us 100.00% 18.047us 6.016us 3 + Activity Buffer Request 22.64% 105.271us 22.64% 105.271us 105.271us 6.016us 33.34% 6.016us 6.016us 1 + aten::empty_like 1.59% 7.411us 5.93% 27.562us 9.187us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.33% 20.151us 4.33% 20.151us 6.717us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 33.77% 157.013us 33.77% 157.013us 52.338us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.09% 5.080us 1.09% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 498.863us -Self CUDA time total: 17.855us +Self CPU time total: 464.937us +Self CUDA time total: 18.047us @@ -4650,19 +4432,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.253us 695.94% 124.253us 124.253us 1 - hf_kernels_causal_conv1d 14.09% 92.581us 99.22% 652.096us 652.096us 0.000us 0.00% 23.838us 23.838us 1 - CausalConv1dFn 11.45% 75.254us 85.13% 559.515us 186.505us 0.000us 0.00% 23.838us 7.946us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 3.84% 25.251us 69.30% 455.481us 151.827us 17.854us 100.00% 23.838us 7.946us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.854us 100.00% 17.854us 5.951us 3 - Activity Buffer Request 41.42% 272.247us 41.42% 272.247us 272.247us 5.984us 33.52% 5.984us 5.984us 1 - aten::empty_like 1.19% 7.849us 4.38% 28.780us 9.593us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.18% 20.931us 3.18% 20.931us 6.977us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 24.04% 157.983us 24.04% 157.983us 52.661us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.78% 5.140us 0.78% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.983us 701.78% 125.983us 125.983us 1 + hf_kernels_causal_conv1d 3.62% 75.400us 99.76% 2.076ms 2.076ms 0.000us 0.00% 23.968us 23.968us 1 + CausalConv1dFn 3.51% 72.963us 96.14% 2.001ms 667.008us 0.000us 0.00% 23.968us 7.989us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.17% 24.320us 91.19% 1.898ms 632.703us 17.952us 100.00% 23.968us 7.989us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.952us 100.00% 17.952us 5.984us 3 + Activity Buffer Request 82.20% 1.711ms 82.20% 1.711ms 1.711ms 6.016us 33.51% 6.016us 6.016us 1 + aten::empty_like 0.41% 8.499us 1.44% 29.950us 9.983us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.03% 21.451us 1.03% 21.451us 7.150us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 7.83% 162.893us 7.83% 162.893us 54.298us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.24% 4.969us 0.24% 4.969us 4.969us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 657.236us -Self CUDA time total: 17.854us +Self CPU time total: 2.081ms +Self CUDA time total: 17.952us @@ -4672,19 +4454,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.982us 651.61% 121.982us 121.982us 1 - hf_kernels_causal_conv1d 16.26% 76.273us 99.00% 464.343us 464.343us 0.000us 0.00% 25.088us 25.088us 1 - CausalConv1dFn 15.20% 71.302us 82.74% 388.070us 129.357us 0.000us 0.00% 25.088us 8.363us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.49% 25.750us 61.15% 286.808us 95.603us 18.720us 100.00% 25.088us 8.363us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.720us 100.00% 18.720us 6.240us 3 - Activity Buffer Request 22.13% 103.813us 22.13% 103.813us 103.813us 6.368us 34.02% 6.368us 6.368us 1 - aten::empty_like 1.75% 8.210us 6.39% 29.960us 9.987us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.64% 21.750us 4.64% 21.750us 7.250us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 33.53% 157.245us 33.53% 157.245us 52.415us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.00% 4.680us 1.00% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.901us 639.40% 119.901us 119.901us 1 + hf_kernels_causal_conv1d 11.47% 73.600us 99.21% 636.820us 636.820us 0.000us 0.00% 25.088us 25.088us 1 + CausalConv1dFn 11.28% 72.380us 87.74% 563.220us 187.740us 0.000us 0.00% 25.088us 8.363us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 3.65% 23.431us 72.11% 462.887us 154.296us 18.752us 100.00% 25.088us 8.363us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.752us 100.00% 18.752us 6.251us 3 + Activity Buffer Request 43.62% 280.014us 43.62% 280.014us 280.014us 6.336us 33.79% 6.336us 6.336us 1 + aten::empty_like 1.22% 7.832us 4.35% 27.953us 9.318us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.13% 20.121us 3.13% 20.121us 6.707us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 24.84% 159.442us 24.84% 159.442us 53.147us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.79% 5.080us 0.79% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 469.023us -Self CUDA time total: 18.720us +Self CPU time total: 641.900us +Self CUDA time total: 18.752us @@ -4694,19 +4476,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 4.40% 80.973us 99.73% 1.837ms 1.837ms 0.000us 0.00% 162.749us 162.749us 1 - CausalConv1dFn 4.14% 76.301us 95.33% 1.756ms 585.285us 0.000us 0.00% 162.749us 54.250us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.45% 26.730us 89.50% 1.648ms 549.474us 97.918us 100.00% 162.749us 54.250us 3 - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 141.950us 144.97% 141.950us 141.950us 1 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 97.918us 100.00% 97.918us 32.639us 3 - Activity Buffer Request 78.99% 1.455ms 78.99% 1.455ms 1.455ms 64.831us 66.21% 64.831us 64.831us 1 - aten::empty_like 0.45% 8.340us 1.69% 31.131us 10.377us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.24% 22.791us 1.24% 22.791us 7.597us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 9.06% 166.885us 9.06% 166.885us 55.628us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.27% 4.980us 0.27% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 11.42% 73.310us 99.16% 636.780us 636.780us 0.000us 0.00% 162.591us 162.591us 1 + CausalConv1dFn 11.12% 71.382us 87.74% 563.470us 187.823us 0.000us 0.00% 162.591us 54.197us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 3.58% 22.989us 72.14% 463.287us 154.429us 97.631us 100.00% 162.591us 54.197us 3 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.208us 133.37% 130.208us 130.208us 1 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 97.631us 100.00% 97.631us 32.544us 3 + Activity Buffer Request 43.38% 278.604us 43.38% 278.604us 278.604us 64.960us 66.54% 64.960us 64.960us 1 + aten::empty_like 1.24% 7.950us 4.48% 28.801us 9.600us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.25% 20.851us 3.25% 20.851us 6.950us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 25.18% 161.694us 25.18% 161.694us 53.898us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.84% 5.420us 0.84% 5.420us 5.420us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.842ms -Self CUDA time total: 97.918us +Self CPU time total: 642.200us +Self CUDA time total: 97.631us @@ -4716,19 +4498,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 16.07% 76.871us 98.94% 473.172us 473.172us 0.000us 0.00% 163.803us 163.803us 1 - CausalConv1dFn 14.96% 71.532us 82.87% 396.301us 132.100us 0.000us 0.00% 163.803us 54.601us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.75% 27.501us 61.56% 294.418us 98.139us 98.685us 100.00% 163.803us 54.601us 3 - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 133.180us 134.95% 133.180us 133.180us 1 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 98.685us 100.00% 98.685us 32.895us 3 - Activity Buffer Request 21.65% 103.543us 21.65% 103.543us 103.543us 65.118us 65.99% 65.118us 65.118us 1 - aten::empty_like 1.52% 7.251us 6.35% 30.351us 10.117us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.83% 23.100us 4.83% 23.100us 7.700us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 34.16% 163.374us 34.16% 163.374us 54.458us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.06% 5.061us 1.06% 5.061us 5.061us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 13.89% 72.060us 98.98% 513.378us 513.378us 0.000us 0.00% 163.263us 163.263us 1 + CausalConv1dFn 13.96% 72.421us 85.08% 441.318us 147.106us 0.000us 0.00% 163.263us 54.421us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.45% 23.099us 65.49% 339.676us 113.225us 98.623us 100.00% 163.263us 54.421us 3 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.111us 131.93% 130.111us 130.111us 1 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 98.623us 100.00% 98.623us 32.874us 3 + Activity Buffer Request 30.19% 156.612us 30.19% 156.612us 156.612us 64.640us 65.54% 64.640us 64.640us 1 + aten::empty_like 1.62% 8.391us 5.63% 29.221us 9.740us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.02% 20.830us 4.02% 20.830us 6.943us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 30.84% 159.965us 30.84% 159.965us 53.322us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.02% 5.310us 1.02% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 478.233us -Self CUDA time total: 98.685us +Self CPU time total: 518.688us +Self CUDA time total: 98.623us impl wl p50(ms) ok @@ -4738,7 +4520,7 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2 0.05 True hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True -hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True +hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.04 True hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True @@ -4760,13 +4542,14 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True▶ UV Install LogsFetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 9%|�� | 1/11 [00:00<00:01, 9.42it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:00, 4.98it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 7.98it/s]+Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:01, 3.51it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.51it/s]Artifacts:
causal_conv1d.jsonl