diff --git "a/openai_moe/impls/gpt_oss_moe.html" "b/openai_moe/impls/gpt_oss_moe.html" --- "a/openai_moe/impls/gpt_oss_moe.html" +++ "b/openai_moe/impls/gpt_oss_moe.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Fri Oct 31 20:00:34 2025 +Mon Nov 10 21:58:43 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 34C P0 81W / 350W | 0MiB / 46068MiB | 18% Default | +| N/A 31C P0 78W / 350W | 0MiB / 46068MiB | 17% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -4156,7 +3938,7 @@ Cell: nv | 0.24s ▼ output ▶ uv-logs | -Cell: benchmark | 24.32s +Cell: benchmark | 25.04s | Raw @@ -4260,29 +4042,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.211ms 197.81% 10.211ms 10.211ms 1 - gpt_oss_experts 16.48% 2.023ms 99.94% 12.270ms 12.270ms 0.000us 0.00% 5.165ms 5.165ms 1 - aten::matmul 0.22% 26.489us 3.82% 468.520us 39.043us 0.000us 0.00% 4.540ms 378.357us 12 - aten::mm 2.36% 289.825us 3.60% 442.031us 36.836us 4.540ms 87.96% 4.540ms 378.357us 12 - ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.078ms 59.62% 3.078ms 341.948us 9 -void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.457ms 28.23% 1.457ms 485.813us 3 - aten::mul 1.42% 174.948us 2.34% 287.701us 11.988us 109.119us 2.11% 109.119us 4.547us 24 - aten::add 1.61% 197.786us 3.85% 472.357us 26.242us 103.039us 2.00% 103.039us 5.724us 18 - aten::index 1.73% 212.127us 2.86% 350.900us 29.242us 86.591us 1.68% 86.591us 7.216us 12 - aten::index_add_ 0.51% 62.499us 0.79% 97.312us 16.219us 82.688us 1.60% 82.688us 13.781us 6 -void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 82.688us 1.60% 82.688us 13.781us 6 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 80.511us 1.56% 80.511us 6.709us 12 - aten::nonzero 2.20% 270.146us 6.58% 808.380us 89.820us 63.743us 1.23% 74.368us 8.263us 9 - aten::clamp 0.98% 120.045us 1.63% 200.026us 16.669us 64.705us 1.25% 64.705us 5.392us 12 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 64.705us 1.25% 64.705us 5.392us 12 - aten::where 0.06% 7.400us 5.25% 644.007us 107.334us 0.000us 0.00% 60.384us 10.064us 6 - aten::nonzero_numpy 0.11% 13.320us 5.19% 636.607us 106.101us 0.000us 0.00% 60.384us 10.064us 6 -void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.063us 1.16% 60.063us 10.011us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.800us 1.10% 56.800us 4.733us 12 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 50.911us 0.99% 50.911us 1.131us 45 + gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.360ms 190.98% 10.360ms 10.360ms 1 + gpt_oss_experts 15.12% 1.924ms 99.94% 12.713ms 12.713ms 0.000us 0.00% 5.428ms 5.428ms 1 + aten::matmul 0.18% 22.311us 3.73% 473.846us 39.487us 0.000us 0.00% 4.800ms 400.041us 12 + aten::mm 2.34% 297.100us 3.55% 451.535us 37.628us 4.800ms 88.50% 4.800ms 400.041us 12 + ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.258ms 60.07% 3.258ms 362.028us 9 +void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.536ms 28.31% 1.536ms 511.862us 3 + aten::mul 1.29% 163.978us 2.14% 271.630us 11.318us 109.411us 2.02% 109.411us 4.559us 24 + aten::add 1.51% 192.130us 3.80% 483.423us 26.857us 103.358us 1.91% 103.358us 5.742us 18 + aten::index 1.52% 193.374us 2.62% 333.164us 27.764us 88.224us 1.63% 88.224us 7.352us 12 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 80.864us 1.49% 80.864us 6.739us 12 + aten::index_add_ 0.46% 58.130us 0.76% 97.241us 16.207us 80.064us 1.48% 80.064us 13.344us 6 +void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 80.064us 1.48% 80.064us 13.344us 6 + aten::nonzero 2.05% 260.439us 6.29% 799.492us 88.832us 65.278us 1.20% 76.093us 8.455us 9 + aten::clamp 0.99% 126.442us 1.60% 203.852us 16.988us 63.456us 1.17% 63.456us 5.288us 12 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 63.456us 1.17% 63.456us 5.288us 12 + aten::where 0.06% 7.391us 5.01% 637.190us 106.198us 0.000us 0.00% 61.533us 10.256us 6 + aten::nonzero_numpy 0.09% 11.880us 4.95% 629.799us 104.967us 0.000us 0.00% 61.533us 10.256us 6 +void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.544us 1.12% 60.544us 10.091us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.929us 1.05% 56.929us 4.744us 12 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 51.073us 0.94% 51.073us 1.135us 45 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 12.278ms -Self CUDA time total: 5.162ms +Self CPU time total: 12.720ms +Self CUDA time total: 5.425ms @@ -4292,29 +4074,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 13.933ms 229.38% 13.933ms 13.933ms 1 - gpt_oss_experts 16.29% 2.560ms 99.97% 15.712ms 15.712ms 0.000us 0.00% 6.077ms 6.077ms 1 - aten::matmul 0.30% 47.223us 5.17% 812.581us 33.858us 0.000us 0.00% 5.268ms 219.512us 24 - aten::mm 3.09% 485.951us 4.87% 765.358us 31.890us 5.268ms 86.73% 5.268ms 219.512us 24 - ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.213ms 85.81% 5.213ms 217.198us 24 - aten::nonzero 2.45% 385.408us 7.89% 1.240ms 82.649us 112.163us 1.85% 134.498us 8.967us 15 - aten::mul 2.03% 318.275us 3.36% 528.222us 11.005us 130.496us 2.15% 130.496us 2.719us 48 - aten::add 2.25% 353.820us 3.74% 587.771us 16.327us 127.072us 2.09% 127.072us 3.530us 36 - aten::where 0.08% 11.882us 7.49% 1.177ms 98.080us 0.000us 0.00% 120.705us 10.059us 12 - aten::nonzero_numpy 0.15% 24.083us 7.41% 1.165ms 97.090us 0.000us 0.00% 120.705us 10.059us 12 - aten::index 2.31% 363.442us 3.93% 617.030us 25.710us 110.145us 1.81% 110.145us 4.589us 24 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 101.312us 1.67% 101.312us 4.221us 24 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 91.447us 1.51% 91.447us 1.051us 87 - aten::clamp 1.32% 207.076us 2.26% 355.011us 14.792us 85.793us 1.41% 85.793us 3.575us 24 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 85.793us 1.41% 85.793us 3.575us 24 - aten::item 0.52% 81.620us 38.60% 6.066ms 84.255us 0.000us 0.00% 75.446us 1.048us 72 - aten::_local_scalar_dense 2.00% 315.046us 38.08% 5.985ms 83.122us 75.446us 1.24% 75.446us 1.048us 72 - aten::index_add_ 0.75% 118.511us 1.16% 182.084us 15.174us 72.926us 1.20% 72.926us 6.077us 12 -void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 72.926us 1.20% 72.926us 6.077us 12 -void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 65.857us 1.08% 65.857us 5.488us 12 + gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 13.942ms 218.38% 13.942ms 13.942ms 1 + gpt_oss_experts 15.57% 2.499ms 99.97% 16.048ms 16.048ms 0.000us 0.00% 6.387ms 6.387ms 1 + aten::matmul 0.25% 39.461us 4.79% 769.170us 32.049us 0.000us 0.00% 5.570ms 232.102us 24 + aten::mm 2.77% 444.894us 4.55% 729.709us 30.405us 5.570ms 87.25% 5.570ms 232.102us 24 + ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.515ms 86.38% 5.515ms 229.794us 24 + aten::nonzero 2.34% 374.919us 7.60% 1.220ms 81.308us 114.786us 1.80% 137.349us 9.157us 15 + aten::mul 1.86% 298.668us 3.09% 496.508us 10.344us 131.614us 2.06% 131.614us 2.742us 48 + aten::add 2.06% 330.439us 3.47% 556.980us 15.472us 127.904us 2.00% 127.904us 3.553us 36 + aten::where 0.07% 11.120us 7.17% 1.151ms 95.939us 0.000us 0.00% 123.109us 10.259us 12 + aten::nonzero_numpy 0.13% 20.771us 7.10% 1.140ms 95.012us 0.000us 0.00% 123.109us 10.259us 12 + aten::index 2.15% 344.365us 3.72% 597.667us 24.903us 111.391us 1.74% 111.391us 4.641us 24 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 101.985us 1.60% 101.985us 4.249us 24 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 91.395us 1.43% 91.395us 1.051us 87 + aten::clamp 1.30% 208.833us 2.21% 355.215us 14.801us 88.257us 1.38% 88.257us 3.677us 24 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 88.257us 1.38% 88.257us 3.677us 24 + aten::item 0.49% 78.042us 39.66% 6.367ms 88.433us 0.000us 0.00% 75.297us 1.046us 72 + aten::_local_scalar_dense 1.92% 308.797us 39.18% 6.289ms 87.349us 75.297us 1.18% 75.297us 1.046us 72 + aten::index_add_ 0.59% 94.029us 0.99% 158.640us 13.220us 71.454us 1.12% 71.454us 5.954us 12 +void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 71.454us 1.12% 71.454us 5.954us 12 +void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.271us 1.04% 66.271us 5.523us 12 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 15.717ms -Self CUDA time total: 6.074ms +Self CPU time total: 16.053ms +Self CUDA time total: 6.384ms @@ -4324,29 +4106,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.540ms 148.48% 12.540ms 12.540ms 1 - gpt_oss_experts 11.83% 1.734ms 99.96% 14.654ms 14.654ms 0.000us 0.00% 8.451ms 8.451ms 1 - aten::matmul 0.16% 23.602us 3.00% 439.592us 36.633us 0.000us 0.00% 7.417ms 618.087us 12 - aten::mm 1.78% 261.037us 2.84% 415.990us 34.666us 7.417ms 87.82% 7.417ms 618.087us 12 -void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.532ms 53.65% 4.532ms 755.263us 6 - ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.475ms 17.46% 1.475ms 491.509us 3 -void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.405ms 16.64% 1.405ms 468.490us 3 - aten::mul 1.05% 153.262us 1.78% 261.173us 10.882us 197.791us 2.34% 197.791us 8.241us 24 - aten::add 1.26% 184.574us 2.07% 304.007us 16.889us 188.543us 2.23% 188.543us 10.475us 18 - aten::index_add_ 0.35% 50.951us 0.57% 83.553us 13.925us 169.408us 2.01% 169.408us 28.235us 6 -void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 169.408us 2.01% 169.408us 28.235us 6 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 149.663us 1.77% 149.663us 12.472us 12 - aten::index 1.27% 186.102us 2.16% 316.927us 26.411us 146.942us 1.74% 146.942us 12.245us 12 -void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 117.440us 1.39% 117.440us 19.573us 6 - aten::clamp 0.71% 104.743us 1.22% 178.924us 14.910us 110.912us 1.31% 110.912us 9.243us 12 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 110.912us 1.31% 110.912us 9.243us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.864us 1.24% 104.864us 8.739us 12 - aten::nonzero 1.58% 232.211us 4.94% 724.348us 80.483us 69.633us 0.82% 81.377us 9.042us 9 - aten::where 0.04% 6.259us 4.08% 597.684us 99.614us 0.000us 0.00% 66.816us 11.136us 6 - aten::nonzero_numpy 0.08% 11.999us 4.03% 591.425us 98.571us 0.000us 0.00% 66.816us 11.136us 6 + gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.597ms 146.28% 12.597ms 12.597ms 1 + gpt_oss_experts 11.26% 1.671ms 99.96% 14.835ms 14.835ms 0.000us 0.00% 8.616ms 8.616ms 1 + aten::matmul 0.13% 19.980us 2.85% 423.596us 35.300us 0.000us 0.00% 7.614ms 634.486us 12 + aten::mm 1.70% 251.563us 2.72% 403.616us 33.635us 7.614ms 88.42% 7.614ms 634.486us 12 +void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.628ms 53.74% 4.628ms 771.312us 6 + ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.524ms 17.70% 1.524ms 508.107us 3 +void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.455ms 16.90% 1.455ms 485.046us 3 + aten::mul 1.00% 148.488us 1.71% 253.960us 10.582us 188.737us 2.19% 188.737us 7.864us 24 + aten::add 1.14% 169.821us 1.97% 292.395us 16.244us 180.606us 2.10% 180.606us 10.034us 18 + aten::index_add_ 0.32% 47.691us 0.57% 84.001us 14.000us 164.000us 1.90% 164.000us 27.333us 6 +void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 164.000us 1.90% 164.000us 27.333us 6 + aten::index 1.23% 181.951us 2.12% 314.145us 26.179us 144.608us 1.68% 144.608us 12.051us 12 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 142.815us 1.66% 142.815us 11.901us 12 +void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 114.816us 1.33% 114.816us 19.136us 6 + aten::clamp 0.72% 107.083us 1.24% 184.134us 15.345us 106.818us 1.24% 106.818us 8.902us 12 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 106.818us 1.24% 106.818us 8.902us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 100.513us 1.17% 100.513us 8.376us 12 + aten::nonzero 1.51% 224.830us 4.84% 718.263us 79.807us 68.894us 0.80% 80.029us 8.892us 9 + aten::where 0.04% 5.681us 3.95% 586.411us 97.735us 0.000us 0.00% 65.405us 10.901us 6 + aten::nonzero_numpy 0.07% 10.160us 3.91% 580.730us 96.788us 0.000us 0.00% 65.405us 10.901us 6 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 14.659ms -Self CUDA time total: 8.446ms +Self CPU time total: 14.841ms +Self CUDA time total: 8.611ms @@ -4356,29 +4138,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.317ms 174.31% 18.317ms 18.317ms 1 - gpt_oss_experts 13.54% 2.761ms 99.97% 20.385ms 20.385ms 0.000us 0.00% 10.514ms 10.514ms 1 - aten::matmul 0.23% 47.082us 4.02% 819.853us 34.161us 0.000us 0.00% 9.237ms 384.865us 24 - aten::mm 2.37% 482.255us 3.79% 772.771us 32.199us 9.237ms 87.90% 9.237ms 384.865us 24 - ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.282ms 59.78% 6.282ms 349.001us 18 -void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.944ms 28.01% 2.944ms 490.655us 6 - aten::mul 1.50% 305.331us 2.55% 520.818us 10.850us 235.298us 2.24% 235.298us 4.902us 48 - aten::add 1.72% 351.113us 2.86% 584.036us 16.223us 213.502us 2.03% 213.502us 5.931us 36 - aten::index 1.95% 397.314us 3.28% 668.454us 27.852us 205.349us 1.95% 205.349us 8.556us 24 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 166.720us 1.59% 166.720us 6.947us 24 - aten::index_add_ 0.50% 101.340us 0.81% 165.573us 13.798us 155.585us 1.48% 155.585us 12.965us 12 -void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 155.585us 1.48% 155.585us 12.965us 12 -void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 146.947us 1.40% 146.947us 12.246us 12 - aten::nonzero 1.95% 398.176us 6.26% 1.276ms 85.090us 121.380us 1.16% 145.668us 9.711us 15 - aten::clamp 1.04% 212.193us 1.79% 365.180us 15.216us 134.239us 1.28% 134.239us 5.593us 24 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.239us 1.28% 134.239us 5.593us 24 - aten::where 0.06% 11.340us 5.97% 1.216ms 101.373us 0.000us 0.00% 131.522us 10.960us 12 - aten::nonzero_numpy 0.12% 24.140us 5.91% 1.205ms 100.428us 0.000us 0.00% 131.522us 10.960us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 119.840us 1.14% 119.840us 4.993us 24 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 100.830us 0.96% 100.830us 1.159us 87 + gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.460ms 171.74% 18.460ms 18.460ms 1 + gpt_oss_experts 12.58% 2.618ms 99.97% 20.806ms 20.806ms 0.000us 0.00% 10.754ms 10.754ms 1 + aten::matmul 0.19% 39.724us 3.85% 801.313us 33.388us 0.000us 0.00% 9.496ms 395.681us 24 + aten::mm 2.21% 460.813us 3.66% 761.589us 31.733us 9.496ms 88.35% 9.496ms 395.681us 24 + ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.491ms 60.39% 6.491ms 360.603us 18 +void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.993ms 27.84% 2.993ms 498.774us 6 + aten::mul 2.25% 467.369us 3.28% 683.452us 14.239us 226.014us 2.10% 226.014us 4.709us 48 + aten::add 1.60% 332.210us 2.74% 569.351us 15.815us 207.013us 1.93% 207.013us 5.750us 36 + aten::index 1.72% 357.427us 2.99% 622.664us 25.944us 203.329us 1.89% 203.329us 8.472us 24 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 162.243us 1.51% 162.243us 6.760us 24 + aten::index_add_ 0.45% 94.395us 0.78% 161.485us 13.457us 155.167us 1.44% 155.167us 12.931us 12 +void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 155.167us 1.44% 155.167us 12.931us 12 + aten::nonzero 1.86% 386.184us 6.07% 1.263ms 84.202us 120.989us 1.13% 144.894us 9.660us 15 +void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 144.769us 1.35% 144.769us 12.064us 12 + aten::where 0.05% 10.779us 5.71% 1.188ms 99.031us 0.000us 0.00% 130.270us 10.856us 12 + aten::nonzero_numpy 0.10% 20.452us 5.66% 1.178ms 98.133us 0.000us 0.00% 130.270us 10.856us 12 + aten::clamp 1.04% 217.185us 1.79% 373.407us 15.559us 129.252us 1.20% 129.252us 5.386us 24 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 129.252us 1.20% 129.252us 5.386us 24 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.584us 1.08% 115.584us 4.816us 24 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 107.234us 1.00% 107.234us 1.233us 87 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 20.390ms -Self CUDA time total: 10.509ms +Self CPU time total: 20.812ms +Self CUDA time total: 10.749ms @@ -4388,29 +4170,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 21.031ms 119.92% 21.031ms 21.031ms 1 - gpt_oss_experts 7.59% 1.747ms 99.98% 23.024ms 23.024ms 0.000us 0.00% 17.548ms 17.548ms 1 - aten::matmul 0.10% 23.660us 1.94% 446.020us 37.168us 0.000us 0.00% 14.659ms 1.222ms 12 - aten::mm 1.17% 268.524us 1.83% 422.360us 35.197us 14.659ms 83.59% 14.659ms 1.222ms 12 -void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 8.967ms 51.13% 8.967ms 1.495ms 6 - ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.685ms 32.42% 5.685ms 947.562us 6 - aten::add 0.82% 187.722us 1.36% 312.616us 17.368us 785.408us 4.48% 785.408us 43.634us 18 - aten::mul 0.68% 156.369us 1.15% 264.222us 11.009us 674.688us 3.85% 674.688us 28.112us 24 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 504.575us 2.88% 504.575us 42.048us 12 - aten::index_add_ 0.22% 50.951us 0.37% 86.132us 14.355us 448.545us 2.56% 448.545us 74.757us 6 -void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 448.545us 2.56% 448.545us 74.757us 6 - aten::clamp 0.46% 107.053us 0.80% 183.295us 15.275us 336.000us 1.92% 336.000us 28.000us 12 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 336.000us 1.92% 336.000us 28.000us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 314.239us 1.79% 314.239us 52.373us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 280.833us 1.60% 280.833us 46.806us 6 - aten::index 0.81% 185.806us 1.39% 320.548us 26.712us 259.102us 1.48% 259.102us 21.592us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 258.944us 1.48% 258.944us 21.579us 12 -void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 225.407us 1.29% 225.407us 37.568us 6 - aten::sigmoid 0.16% 36.131us 0.27% 61.901us 10.317us 175.073us 1.00% 175.073us 29.179us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 175.073us 1.00% 175.073us 29.179us 6 + gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 21.083ms 119.21% 21.083ms 21.083ms 1 + gpt_oss_experts 7.12% 1.665ms 99.98% 23.365ms 23.365ms 0.000us 0.00% 17.695ms 17.695ms 1 + aten::matmul 0.09% 20.129us 1.89% 441.429us 36.786us 0.000us 0.00% 14.828ms 1.236ms 12 + aten::mm 1.11% 260.517us 1.80% 421.300us 35.108us 14.828ms 83.84% 14.828ms 1.236ms 12 +void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.047ms 51.15% 9.047ms 1.508ms 6 + ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.773ms 32.64% 5.773ms 962.167us 6 + aten::add 0.74% 174.025us 1.27% 296.156us 16.453us 776.579us 4.39% 776.579us 43.143us 18 + aten::mul 0.64% 149.555us 1.10% 257.226us 10.718us 654.338us 3.70% 654.338us 27.264us 24 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 499.874us 2.83% 499.874us 41.656us 12 + aten::index_add_ 0.21% 48.400us 0.36% 84.241us 14.040us 449.985us 2.54% 449.985us 74.998us 6 +void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 449.985us 2.54% 449.985us 74.998us 6 + aten::clamp 0.46% 107.321us 0.79% 185.253us 15.438us 329.054us 1.86% 329.054us 27.421us 12 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 329.054us 1.86% 329.054us 27.421us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 300.737us 1.70% 300.737us 50.123us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 276.705us 1.56% 276.705us 46.117us 6 + aten::index 0.76% 178.051us 1.32% 309.462us 25.788us 268.800us 1.52% 268.800us 22.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 253.889us 1.44% 253.889us 21.157us 12 +void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 236.095us 1.33% 236.095us 39.349us 6 + aten::sigmoid 0.16% 36.571us 0.27% 63.572us 10.595us 176.833us 1.00% 176.833us 29.472us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 176.833us 1.00% 176.833us 29.472us 6 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 23.030ms -Self CUDA time total: 17.537ms +Self CPU time total: 23.371ms +Self CUDA time total: 17.686ms @@ -4420,29 +4202,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.377ms 140.11% 24.377ms 24.377ms 1 - gpt_oss_experts 10.50% 2.651ms 99.98% 25.237ms 25.237ms 0.000us 0.00% 17.408ms 17.408ms 1 - aten::matmul 0.19% 47.519us 3.41% 860.801us 35.867us 0.000us 0.00% 15.185ms 632.705us 24 - aten::mm 2.06% 521.061us 3.22% 813.282us 33.887us 15.185ms 87.28% 15.185ms 632.705us 24 -void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.179ms 52.76% 9.179ms 764.922us 12 - ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.124ms 17.96% 3.124ms 520.682us 6 -void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.871ms 16.50% 2.871ms 478.432us 6 - aten::add 1.42% 359.495us 2.37% 598.003us 16.611us 427.713us 2.46% 427.713us 11.881us 36 - aten::mul 1.23% 309.946us 2.09% 527.073us 10.981us 420.510us 2.42% 420.510us 8.761us 48 - aten::index_add_ 0.40% 101.283us 0.66% 166.886us 13.907us 383.489us 2.20% 383.489us 31.957us 12 -void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 383.489us 2.20% 383.489us 31.957us 12 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 343.712us 1.98% 343.712us 14.321us 24 - aten::index 1.56% 393.991us 2.62% 662.158us 27.590us 337.086us 1.94% 337.086us 14.045us 24 -void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 272.926us 1.57% 272.926us 22.744us 12 - aten::clamp 0.84% 212.993us 1.44% 363.038us 15.127us 230.431us 1.32% 230.431us 9.601us 24 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 230.431us 1.32% 230.431us 9.601us 24 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 223.071us 1.28% 223.071us 9.295us 24 - aten::nonzero 1.57% 395.401us 5.00% 1.262ms 84.127us 128.836us 0.74% 156.164us 10.411us 15 - aten::where 0.05% 12.011us 4.77% 1.205ms 100.378us 0.000us 0.00% 140.900us 11.742us 12 - aten::nonzero_numpy 0.10% 25.021us 4.72% 1.193ms 99.377us 0.000us 0.00% 140.900us 11.742us 12 + gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.709ms 139.35% 24.709ms 24.709ms 1 + gpt_oss_experts 9.76% 2.650ms 99.98% 27.156ms 27.156ms 0.000us 0.00% 17.741ms 17.741ms 1 + aten::matmul 0.15% 40.162us 3.17% 860.144us 35.839us 0.000us 0.00% 15.537ms 647.383us 24 + aten::mm 1.90% 517.331us 3.02% 819.982us 34.166us 15.537ms 87.63% 15.537ms 647.383us 24 +void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.352ms 52.74% 9.352ms 779.317us 12 + ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.225ms 18.19% 3.225ms 537.452us 6 +void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.947ms 16.62% 2.947ms 491.169us 6 + aten::add 1.29% 349.077us 2.22% 601.999us 16.722us 419.552us 2.37% 419.552us 11.654us 36 + aten::mul 1.15% 311.953us 1.98% 539.014us 11.229us 410.371us 2.31% 410.371us 8.549us 48 + aten::index_add_ 0.36% 97.270us 0.61% 164.412us 13.701us 379.682us 2.14% 379.682us 31.640us 12 +void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 379.682us 2.14% 379.682us 31.640us 12 + aten::index 1.31% 354.897us 2.36% 641.129us 26.714us 344.639us 1.94% 344.639us 14.360us 24 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 337.056us 1.90% 337.056us 14.044us 24 +void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 280.607us 1.58% 280.607us 23.384us 12 + aten::clamp 0.78% 212.661us 1.36% 368.626us 15.359us 225.662us 1.27% 225.662us 9.403us 24 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 225.662us 1.27% 225.662us 9.403us 24 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 218.112us 1.23% 218.112us 9.088us 24 + aten::nonzero 1.41% 383.824us 4.68% 1.271ms 84.702us 127.715us 0.72% 153.604us 10.240us 15 + aten::where 0.04% 11.073us 4.43% 1.203ms 100.252us 0.000us 0.00% 138.052us 11.504us 12 + aten::nonzero_numpy 0.07% 20.230us 4.39% 1.192ms 99.329us 0.000us 0.00% 138.052us 11.504us 12 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 25.242ms -Self CUDA time total: 17.398ms +Self CPU time total: 27.162ms +Self CUDA time total: 17.731ms @@ -4452,29 +4234,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.556ms 109.47% 40.556ms 40.556ms 1 - gpt_oss_experts 4.33% 1.794ms 99.85% 41.353ms 41.353ms 0.000us 0.00% 37.080ms 37.080ms 1 - aten::matmul 0.06% 24.371us 1.08% 445.903us 37.159us 0.000us 0.00% 27.082ms 2.257ms 12 - aten::mm 0.70% 291.738us 1.02% 421.532us 35.128us 27.082ms 73.10% 27.082ms 2.257ms 12 -void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 27.079ms 73.09% 27.079ms 2.257ms 12 - aten::mul 0.38% 159.199us 0.65% 268.178us 11.174us 2.983ms 8.05% 2.983ms 124.287us 24 - aten::add 0.48% 198.424us 1.09% 451.763us 25.098us 2.404ms 6.49% 2.404ms 133.559us 18 - aten::clamp 0.27% 112.290us 0.46% 189.433us 15.786us 2.392ms 6.46% 2.392ms 199.373us 12 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.392ms 6.46% 2.392ms 199.373us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.988ms 5.37% 1.988ms 165.669us 12 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.629ms 4.40% 1.629ms 135.763us 12 - aten::index_add_ 0.12% 50.103us 0.20% 84.453us 14.076us 899.456us 2.43% 899.456us 149.909us 6 -void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 899.456us 2.43% 899.456us 149.909us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 774.912us 2.09% 774.912us 129.152us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 733.217us 1.98% 733.217us 122.203us 6 - aten::index 0.45% 187.302us 0.77% 318.787us 26.566us 712.767us 1.92% 712.767us 59.397us 12 -void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 678.496us 1.83% 678.496us 113.083us 6 - aten::sigmoid 0.09% 36.082us 0.15% 63.023us 10.504us 323.008us 0.87% 323.008us 53.835us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 323.008us 0.87% 323.008us 53.835us 6 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 261.631us 0.71% 261.631us 43.605us 6 + gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.750ms 109.03% 40.750ms 40.750ms 1 + gpt_oss_experts 4.08% 1.695ms 99.82% 41.512ms 41.512ms 0.000us 0.00% 37.407ms 37.407ms 1 + aten::matmul 0.05% 20.951us 1.02% 424.118us 35.343us 0.000us 0.00% 27.409ms 2.284ms 12 + aten::mm 0.67% 277.566us 0.97% 403.167us 33.597us 27.409ms 73.34% 27.409ms 2.284ms 12 +void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 27.406ms 73.33% 27.406ms 2.284ms 12 + aten::mul 0.37% 154.550us 0.63% 261.852us 10.911us 2.976ms 7.96% 2.976ms 124.014us 24 + aten::add 0.45% 185.160us 1.07% 445.895us 24.772us 2.401ms 6.42% 2.401ms 133.369us 18 + aten::clamp 0.28% 116.599us 0.48% 198.482us 16.540us 2.391ms 6.40% 2.391ms 199.291us 12 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.391ms 6.40% 2.391ms 199.291us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.983ms 5.30% 1.983ms 165.222us 12 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.625ms 4.35% 1.625ms 135.419us 12 + aten::index_add_ 0.12% 48.080us 0.21% 86.751us 14.459us 910.402us 2.44% 910.402us 151.734us 6 +void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 910.402us 2.44% 910.402us 151.734us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 775.618us 2.08% 775.618us 129.270us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 740.611us 1.98% 740.611us 123.435us 6 + aten::index 0.44% 181.234us 0.76% 317.848us 26.487us 714.884us 1.91% 714.884us 59.574us 12 +void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 681.379us 1.82% 681.379us 113.563us 6 + aten::sigmoid 0.09% 38.611us 0.16% 65.922us 10.987us 320.927us 0.86% 320.927us 53.488us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 320.927us 0.86% 320.927us 53.488us 6 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 253.057us 0.68% 253.057us 42.176us 6 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 41.415ms -Self CUDA time total: 37.046ms +Self CPU time total: 41.585ms +Self CUDA time total: 37.374ms @@ -4484,55 +4266,54 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 41.050ms 117.27% 41.050ms 41.050ms 1 - gpt_oss_experts 6.46% 2.709ms 99.99% 41.912ms 41.912ms 0.000us 0.00% 35.025ms 35.025ms 1 - aten::matmul 0.11% 47.590us 2.12% 888.873us 37.036us 0.000us 0.00% 29.051ms 1.210ms 24 - aten::mm 1.28% 536.727us 2.01% 841.283us 35.053us 29.051ms 82.99% 29.051ms 1.210ms 24 -void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.585ms 58.81% 20.585ms 1.372ms 15 - ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.453ms 24.15% 8.453ms 939.204us 9 - aten::add 0.88% 367.610us 1.45% 609.056us 16.918us 1.486ms 4.24% 1.486ms 41.264us 36 - aten::mul 0.74% 309.128us 1.24% 518.283us 10.798us 1.380ms 3.94% 1.380ms 28.757us 48 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 925.695us 2.64% 925.695us 38.571us 24 - aten::index_add_ 0.24% 99.111us 0.40% 167.273us 13.939us 903.487us 2.58% 903.487us 75.291us 12 -void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 903.487us 2.58% 903.487us 75.291us 12 - aten::clamp 0.51% 214.986us 0.87% 364.790us 15.200us 775.806us 2.22% 775.806us 32.325us 24 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 775.806us 2.22% 775.806us 32.325us 24 - aten::index 0.89% 373.269us 1.50% 629.207us 26.217us 670.881us 1.92% 670.881us 27.953us 24 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 631.200us 1.80% 631.200us 52.600us 12 -void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 600.224us 1.71% 600.224us 50.019us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 559.808us 1.60% 559.808us 46.651us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 540.611us 1.54% 540.611us 22.525us 24 - aten::sigmoid 0.17% 72.182us 0.29% 123.582us 10.298us 351.039us 1.00% 351.039us 29.253us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 351.039us 1.00% 351.039us 29.253us 12 + gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 41.218ms 116.52% 41.218ms 41.218ms 1 + gpt_oss_experts 6.00% 2.524ms 99.99% 42.088ms 42.088ms 0.000us 0.00% 35.395ms 35.395ms 1 + aten::matmul 0.10% 40.160us 2.08% 875.043us 36.460us 0.000us 0.00% 29.436ms 1.226ms 24 + aten::mm 1.24% 520.099us 1.98% 834.883us 34.787us 29.436ms 83.21% 29.436ms 1.226ms 24 +void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.785ms 58.75% 20.785ms 1.386ms 15 + ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.635ms 24.41% 8.635ms 959.410us 9 + aten::add 0.83% 349.812us 1.43% 602.505us 16.736us 1.482ms 4.19% 1.482ms 41.161us 36 + aten::mul 0.72% 302.661us 1.25% 525.878us 10.956us 1.369ms 3.87% 1.369ms 28.527us 48 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 928.163us 2.62% 928.163us 38.673us 24 + aten::index_add_ 0.23% 95.791us 0.40% 170.382us 14.198us 908.198us 2.57% 908.198us 75.683us 12 +void at::native::indexFuncLargeIndex<float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 908.198us 2.57% 908.198us 75.683us 12 + aten::clamp 0.52% 220.263us 0.90% 378.355us 15.765us 771.551us 2.18% 771.551us 32.148us 24 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 771.551us 2.18% 771.551us 32.148us 24 + aten::index 0.83% 351.191us 1.46% 613.487us 25.562us 665.121us 1.88% 665.121us 27.713us 24 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 648.065us 1.83% 648.065us 54.005us 12 +void at::native::vectorized_gather_kernel<16, long>(... 0.00% 0.000us 0.00% 0.000us 0.000us 594.560us 1.68% 594.560us 49.547us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 553.635us 1.57% 553.635us 46.136us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 519.010us 1.47% 519.010us 21.625us 24 + aten::sigmoid 0.17% 72.451us 0.30% 125.701us 10.475us 356.257us 1.01% 356.257us 29.688us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 356.257us 1.01% 356.257us 29.688us 12 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 41.917ms -Self CUDA time total: 35.005ms +Self CPU time total: 42.094ms +Self CUDA time total: 35.375ms impl wl p50(ms) ok -gpt_oss_experts cuda_B1_S1024_E2 3.79 True -gpt_oss_experts cuda_B1_S1024_E4 5.24 True -gpt_oss_experts cuda_B1_S512_E2 2.63 True -gpt_oss_experts cuda_B1_S512_E4 3.89 True -gpt_oss_experts cuda_B4_S1024_E2 13.28 True -gpt_oss_experts cuda_B4_S1024_E4 13.19 True -gpt_oss_experts cuda_B4_S512_E2 6.74 True -gpt_oss_experts cuda_B4_S512_E4 7.36 True +gpt_oss_experts cuda_B1_S1024_E2 3.84 True +gpt_oss_experts cuda_B1_S1024_E4 5.30 True +gpt_oss_experts cuda_B1_S512_E2 2.68 True +gpt_oss_experts cuda_B1_S512_E4 3.91 True +gpt_oss_experts cuda_B4_S1024_E2 13.35 True +gpt_oss_experts cuda_B4_S1024_E4 13.35 True +gpt_oss_experts cuda_B4_S512_E2 6.80 True +gpt_oss_experts cuda_B4_S512_E4 7.46 True▶ UV Install LogsFetching 6 files: 0%| | 0/6 [00:00<?, ?it/s] -Fetching 6 files: 33%|███▎ | 2/6 [00:00<00:00, 16.13it/s] -Fetching 6 files: 67%|██████▋ | 4/6 [00:00<00:00, 7.33it/s] -Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 11.97it/s]+Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 3.54it/s] +Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 7.08it/s]