diff --git "a/openai_moe/impls/gpt_oss_moe.html" "b/openai_moe/impls/gpt_oss_moe.html" --- "a/openai_moe/impls/gpt_oss_moe.html" +++ "b/openai_moe/impls/gpt_oss_moe.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -4106,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.24s +Cell: nv | 0.22s | Raw @@ -4123,16 +3905,16 @@ Cell: nv | 0.24s
-
Fri Oct 31 20:00:34 2025       
+
Mon Nov 10 21:58:43 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
-|-----------------------------------------+------------------------+----------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   34C    P0             81W /  350W |       0MiB /  46068MiB |     18%      Default |
+| N/A   31C    P0             78W /  350W |       0MiB /  46068MiB |     17%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4156,7 +3938,7 @@ Cell: nv | 0.24s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 24.32s
+Cell: benchmark | 25.04s
  | 
 
 Raw
@@ -4260,29 +4042,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      10.211ms       197.81%      10.211ms      10.211ms             1  
-                                        gpt_oss_experts        16.48%       2.023ms        99.94%      12.270ms      12.270ms       0.000us         0.00%       5.165ms       5.165ms             1  
-                                           aten::matmul         0.22%      26.489us         3.82%     468.520us      39.043us       0.000us         0.00%       4.540ms     378.357us            12  
-                                               aten::mm         2.36%     289.825us         3.60%     442.031us      36.836us       4.540ms        87.96%       4.540ms     378.357us            12  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.078ms        59.62%       3.078ms     341.948us             9  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.457ms        28.23%       1.457ms     485.813us             3  
-                                              aten::mul         1.42%     174.948us         2.34%     287.701us      11.988us     109.119us         2.11%     109.119us       4.547us            24  
-                                              aten::add         1.61%     197.786us         3.85%     472.357us      26.242us     103.039us         2.00%     103.039us       5.724us            18  
-                                            aten::index         1.73%     212.127us         2.86%     350.900us      29.242us      86.591us         1.68%      86.591us       7.216us            12  
-                                       aten::index_add_         0.51%      62.499us         0.79%      97.312us      16.219us      82.688us         1.60%      82.688us      13.781us             6  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      82.688us         1.60%      82.688us      13.781us             6  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      80.511us         1.56%      80.511us       6.709us            12  
-                                          aten::nonzero         2.20%     270.146us         6.58%     808.380us      89.820us      63.743us         1.23%      74.368us       8.263us             9  
-                                            aten::clamp         0.98%     120.045us         1.63%     200.026us      16.669us      64.705us         1.25%      64.705us       5.392us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      64.705us         1.25%      64.705us       5.392us            12  
-                                            aten::where         0.06%       7.400us         5.25%     644.007us     107.334us       0.000us         0.00%      60.384us      10.064us             6  
-                                    aten::nonzero_numpy         0.11%      13.320us         5.19%     636.607us     106.101us       0.000us         0.00%      60.384us      10.064us             6  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      60.063us         1.16%      60.063us      10.011us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      56.800us         1.10%      56.800us       4.733us            12  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      50.911us         0.99%      50.911us       1.131us            45  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      10.360ms       190.98%      10.360ms      10.360ms             1  
+                                        gpt_oss_experts        15.12%       1.924ms        99.94%      12.713ms      12.713ms       0.000us         0.00%       5.428ms       5.428ms             1  
+                                           aten::matmul         0.18%      22.311us         3.73%     473.846us      39.487us       0.000us         0.00%       4.800ms     400.041us            12  
+                                               aten::mm         2.34%     297.100us         3.55%     451.535us      37.628us       4.800ms        88.50%       4.800ms     400.041us            12  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.258ms        60.07%       3.258ms     362.028us             9  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.536ms        28.31%       1.536ms     511.862us             3  
+                                              aten::mul         1.29%     163.978us         2.14%     271.630us      11.318us     109.411us         2.02%     109.411us       4.559us            24  
+                                              aten::add         1.51%     192.130us         3.80%     483.423us      26.857us     103.358us         1.91%     103.358us       5.742us            18  
+                                            aten::index         1.52%     193.374us         2.62%     333.164us      27.764us      88.224us         1.63%      88.224us       7.352us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      80.864us         1.49%      80.864us       6.739us            12  
+                                       aten::index_add_         0.46%      58.130us         0.76%      97.241us      16.207us      80.064us         1.48%      80.064us      13.344us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      80.064us         1.48%      80.064us      13.344us             6  
+                                          aten::nonzero         2.05%     260.439us         6.29%     799.492us      88.832us      65.278us         1.20%      76.093us       8.455us             9  
+                                            aten::clamp         0.99%     126.442us         1.60%     203.852us      16.988us      63.456us         1.17%      63.456us       5.288us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      63.456us         1.17%      63.456us       5.288us            12  
+                                            aten::where         0.06%       7.391us         5.01%     637.190us     106.198us       0.000us         0.00%      61.533us      10.256us             6  
+                                    aten::nonzero_numpy         0.09%      11.880us         4.95%     629.799us     104.967us       0.000us         0.00%      61.533us      10.256us             6  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      60.544us         1.12%      60.544us      10.091us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      56.929us         1.05%      56.929us       4.744us            12  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      51.073us         0.94%      51.073us       1.135us            45  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.278ms
-Self CUDA time total: 5.162ms
+Self CPU time total: 12.720ms
+Self CUDA time total: 5.425ms
 
 
 
@@ -4292,29 +4074,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      13.933ms       229.38%      13.933ms      13.933ms             1  
-                                        gpt_oss_experts        16.29%       2.560ms        99.97%      15.712ms      15.712ms       0.000us         0.00%       6.077ms       6.077ms             1  
-                                           aten::matmul         0.30%      47.223us         5.17%     812.581us      33.858us       0.000us         0.00%       5.268ms     219.512us            24  
-                                               aten::mm         3.09%     485.951us         4.87%     765.358us      31.890us       5.268ms        86.73%       5.268ms     219.512us            24  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.213ms        85.81%       5.213ms     217.198us            24  
-                                          aten::nonzero         2.45%     385.408us         7.89%       1.240ms      82.649us     112.163us         1.85%     134.498us       8.967us            15  
-                                              aten::mul         2.03%     318.275us         3.36%     528.222us      11.005us     130.496us         2.15%     130.496us       2.719us            48  
-                                              aten::add         2.25%     353.820us         3.74%     587.771us      16.327us     127.072us         2.09%     127.072us       3.530us            36  
-                                            aten::where         0.08%      11.882us         7.49%       1.177ms      98.080us       0.000us         0.00%     120.705us      10.059us            12  
-                                    aten::nonzero_numpy         0.15%      24.083us         7.41%       1.165ms      97.090us       0.000us         0.00%     120.705us      10.059us            12  
-                                            aten::index         2.31%     363.442us         3.93%     617.030us      25.710us     110.145us         1.81%     110.145us       4.589us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     101.312us         1.67%     101.312us       4.221us            24  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      91.447us         1.51%      91.447us       1.051us            87  
-                                            aten::clamp         1.32%     207.076us         2.26%     355.011us      14.792us      85.793us         1.41%      85.793us       3.575us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      85.793us         1.41%      85.793us       3.575us            24  
-                                             aten::item         0.52%      81.620us        38.60%       6.066ms      84.255us       0.000us         0.00%      75.446us       1.048us            72  
-                              aten::_local_scalar_dense         2.00%     315.046us        38.08%       5.985ms      83.122us      75.446us         1.24%      75.446us       1.048us            72  
-                                       aten::index_add_         0.75%     118.511us         1.16%     182.084us      15.174us      72.926us         1.20%      72.926us       6.077us            12  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      72.926us         1.20%      72.926us       6.077us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      65.857us         1.08%      65.857us       5.488us            12  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      13.942ms       218.38%      13.942ms      13.942ms             1  
+                                        gpt_oss_experts        15.57%       2.499ms        99.97%      16.048ms      16.048ms       0.000us         0.00%       6.387ms       6.387ms             1  
+                                           aten::matmul         0.25%      39.461us         4.79%     769.170us      32.049us       0.000us         0.00%       5.570ms     232.102us            24  
+                                               aten::mm         2.77%     444.894us         4.55%     729.709us      30.405us       5.570ms        87.25%       5.570ms     232.102us            24  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.515ms        86.38%       5.515ms     229.794us            24  
+                                          aten::nonzero         2.34%     374.919us         7.60%       1.220ms      81.308us     114.786us         1.80%     137.349us       9.157us            15  
+                                              aten::mul         1.86%     298.668us         3.09%     496.508us      10.344us     131.614us         2.06%     131.614us       2.742us            48  
+                                              aten::add         2.06%     330.439us         3.47%     556.980us      15.472us     127.904us         2.00%     127.904us       3.553us            36  
+                                            aten::where         0.07%      11.120us         7.17%       1.151ms      95.939us       0.000us         0.00%     123.109us      10.259us            12  
+                                    aten::nonzero_numpy         0.13%      20.771us         7.10%       1.140ms      95.012us       0.000us         0.00%     123.109us      10.259us            12  
+                                            aten::index         2.15%     344.365us         3.72%     597.667us      24.903us     111.391us         1.74%     111.391us       4.641us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     101.985us         1.60%     101.985us       4.249us            24  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      91.395us         1.43%      91.395us       1.051us            87  
+                                            aten::clamp         1.30%     208.833us         2.21%     355.215us      14.801us      88.257us         1.38%      88.257us       3.677us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      88.257us         1.38%      88.257us       3.677us            24  
+                                             aten::item         0.49%      78.042us        39.66%       6.367ms      88.433us       0.000us         0.00%      75.297us       1.046us            72  
+                              aten::_local_scalar_dense         1.92%     308.797us        39.18%       6.289ms      87.349us      75.297us         1.18%      75.297us       1.046us            72  
+                                       aten::index_add_         0.59%      94.029us         0.99%     158.640us      13.220us      71.454us         1.12%      71.454us       5.954us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      71.454us         1.12%      71.454us       5.954us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      66.271us         1.04%      66.271us       5.523us            12  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 15.717ms
-Self CUDA time total: 6.074ms
+Self CPU time total: 16.053ms
+Self CUDA time total: 6.384ms
 
 
 
@@ -4324,29 +4106,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      12.540ms       148.48%      12.540ms      12.540ms             1  
-                                        gpt_oss_experts        11.83%       1.734ms        99.96%      14.654ms      14.654ms       0.000us         0.00%       8.451ms       8.451ms             1  
-                                           aten::matmul         0.16%      23.602us         3.00%     439.592us      36.633us       0.000us         0.00%       7.417ms     618.087us            12  
-                                               aten::mm         1.78%     261.037us         2.84%     415.990us      34.666us       7.417ms        87.82%       7.417ms     618.087us            12  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       4.532ms        53.65%       4.532ms     755.263us             6  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       1.475ms        17.46%       1.475ms     491.509us             3  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.405ms        16.64%       1.405ms     468.490us             3  
-                                              aten::mul         1.05%     153.262us         1.78%     261.173us      10.882us     197.791us         2.34%     197.791us       8.241us            24  
-                                              aten::add         1.26%     184.574us         2.07%     304.007us      16.889us     188.543us         2.23%     188.543us      10.475us            18  
-                                       aten::index_add_         0.35%      50.951us         0.57%      83.553us      13.925us     169.408us         2.01%     169.408us      28.235us             6  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     169.408us         2.01%     169.408us      28.235us             6  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     149.663us         1.77%     149.663us      12.472us            12  
-                                            aten::index         1.27%     186.102us         2.16%     316.927us      26.411us     146.942us         1.74%     146.942us      12.245us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     117.440us         1.39%     117.440us      19.573us             6  
-                                            aten::clamp         0.71%     104.743us         1.22%     178.924us      14.910us     110.912us         1.31%     110.912us       9.243us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     110.912us         1.31%     110.912us       9.243us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.864us         1.24%     104.864us       8.739us            12  
-                                          aten::nonzero         1.58%     232.211us         4.94%     724.348us      80.483us      69.633us         0.82%      81.377us       9.042us             9  
-                                            aten::where         0.04%       6.259us         4.08%     597.684us      99.614us       0.000us         0.00%      66.816us      11.136us             6  
-                                    aten::nonzero_numpy         0.08%      11.999us         4.03%     591.425us      98.571us       0.000us         0.00%      66.816us      11.136us             6  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      12.597ms       146.28%      12.597ms      12.597ms             1  
+                                        gpt_oss_experts        11.26%       1.671ms        99.96%      14.835ms      14.835ms       0.000us         0.00%       8.616ms       8.616ms             1  
+                                           aten::matmul         0.13%      19.980us         2.85%     423.596us      35.300us       0.000us         0.00%       7.614ms     634.486us            12  
+                                               aten::mm         1.70%     251.563us         2.72%     403.616us      33.635us       7.614ms        88.42%       7.614ms     634.486us            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       4.628ms        53.74%       4.628ms     771.312us             6  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       1.524ms        17.70%       1.524ms     508.107us             3  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.455ms        16.90%       1.455ms     485.046us             3  
+                                              aten::mul         1.00%     148.488us         1.71%     253.960us      10.582us     188.737us         2.19%     188.737us       7.864us            24  
+                                              aten::add         1.14%     169.821us         1.97%     292.395us      16.244us     180.606us         2.10%     180.606us      10.034us            18  
+                                       aten::index_add_         0.32%      47.691us         0.57%      84.001us      14.000us     164.000us         1.90%     164.000us      27.333us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     164.000us         1.90%     164.000us      27.333us             6  
+                                            aten::index         1.23%     181.951us         2.12%     314.145us      26.179us     144.608us         1.68%     144.608us      12.051us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     142.815us         1.66%     142.815us      11.901us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     114.816us         1.33%     114.816us      19.136us             6  
+                                            aten::clamp         0.72%     107.083us         1.24%     184.134us      15.345us     106.818us         1.24%     106.818us       8.902us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     106.818us         1.24%     106.818us       8.902us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     100.513us         1.17%     100.513us       8.376us            12  
+                                          aten::nonzero         1.51%     224.830us         4.84%     718.263us      79.807us      68.894us         0.80%      80.029us       8.892us             9  
+                                            aten::where         0.04%       5.681us         3.95%     586.411us      97.735us       0.000us         0.00%      65.405us      10.901us             6  
+                                    aten::nonzero_numpy         0.07%      10.160us         3.91%     580.730us      96.788us       0.000us         0.00%      65.405us      10.901us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 14.659ms
-Self CUDA time total: 8.446ms
+Self CPU time total: 14.841ms
+Self CUDA time total: 8.611ms
 
 
 
@@ -4356,29 +4138,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      18.317ms       174.31%      18.317ms      18.317ms             1  
-                                        gpt_oss_experts        13.54%       2.761ms        99.97%      20.385ms      20.385ms       0.000us         0.00%      10.514ms      10.514ms             1  
-                                           aten::matmul         0.23%      47.082us         4.02%     819.853us      34.161us       0.000us         0.00%       9.237ms     384.865us            24  
-                                               aten::mm         2.37%     482.255us         3.79%     772.771us      32.199us       9.237ms        87.90%       9.237ms     384.865us            24  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       6.282ms        59.78%       6.282ms     349.001us            18  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.944ms        28.01%       2.944ms     490.655us             6  
-                                              aten::mul         1.50%     305.331us         2.55%     520.818us      10.850us     235.298us         2.24%     235.298us       4.902us            48  
-                                              aten::add         1.72%     351.113us         2.86%     584.036us      16.223us     213.502us         2.03%     213.502us       5.931us            36  
-                                            aten::index         1.95%     397.314us         3.28%     668.454us      27.852us     205.349us         1.95%     205.349us       8.556us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     166.720us         1.59%     166.720us       6.947us            24  
-                                       aten::index_add_         0.50%     101.340us         0.81%     165.573us      13.798us     155.585us         1.48%     155.585us      12.965us            12  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     155.585us         1.48%     155.585us      12.965us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     146.947us         1.40%     146.947us      12.246us            12  
-                                          aten::nonzero         1.95%     398.176us         6.26%       1.276ms      85.090us     121.380us         1.16%     145.668us       9.711us            15  
-                                            aten::clamp         1.04%     212.193us         1.79%     365.180us      15.216us     134.239us         1.28%     134.239us       5.593us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.239us         1.28%     134.239us       5.593us            24  
-                                            aten::where         0.06%      11.340us         5.97%       1.216ms     101.373us       0.000us         0.00%     131.522us      10.960us            12  
-                                    aten::nonzero_numpy         0.12%      24.140us         5.91%       1.205ms     100.428us       0.000us         0.00%     131.522us      10.960us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     119.840us         1.14%     119.840us       4.993us            24  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     100.830us         0.96%     100.830us       1.159us            87  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      18.460ms       171.74%      18.460ms      18.460ms             1  
+                                        gpt_oss_experts        12.58%       2.618ms        99.97%      20.806ms      20.806ms       0.000us         0.00%      10.754ms      10.754ms             1  
+                                           aten::matmul         0.19%      39.724us         3.85%     801.313us      33.388us       0.000us         0.00%       9.496ms     395.681us            24  
+                                               aten::mm         2.21%     460.813us         3.66%     761.589us      31.733us       9.496ms        88.35%       9.496ms     395.681us            24  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       6.491ms        60.39%       6.491ms     360.603us            18  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.993ms        27.84%       2.993ms     498.774us             6  
+                                              aten::mul         2.25%     467.369us         3.28%     683.452us      14.239us     226.014us         2.10%     226.014us       4.709us            48  
+                                              aten::add         1.60%     332.210us         2.74%     569.351us      15.815us     207.013us         1.93%     207.013us       5.750us            36  
+                                            aten::index         1.72%     357.427us         2.99%     622.664us      25.944us     203.329us         1.89%     203.329us       8.472us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     162.243us         1.51%     162.243us       6.760us            24  
+                                       aten::index_add_         0.45%      94.395us         0.78%     161.485us      13.457us     155.167us         1.44%     155.167us      12.931us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     155.167us         1.44%     155.167us      12.931us            12  
+                                          aten::nonzero         1.86%     386.184us         6.07%       1.263ms      84.202us     120.989us         1.13%     144.894us       9.660us            15  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     144.769us         1.35%     144.769us      12.064us            12  
+                                            aten::where         0.05%      10.779us         5.71%       1.188ms      99.031us       0.000us         0.00%     130.270us      10.856us            12  
+                                    aten::nonzero_numpy         0.10%      20.452us         5.66%       1.178ms      98.133us       0.000us         0.00%     130.270us      10.856us            12  
+                                            aten::clamp         1.04%     217.185us         1.79%     373.407us      15.559us     129.252us         1.20%     129.252us       5.386us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     129.252us         1.20%     129.252us       5.386us            24  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.584us         1.08%     115.584us       4.816us            24  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     107.234us         1.00%     107.234us       1.233us            87  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 20.390ms
-Self CUDA time total: 10.509ms
+Self CPU time total: 20.812ms
+Self CUDA time total: 10.749ms
 
 
 
@@ -4388,29 +4170,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      21.031ms       119.92%      21.031ms      21.031ms             1  
-                                        gpt_oss_experts         7.59%       1.747ms        99.98%      23.024ms      23.024ms       0.000us         0.00%      17.548ms      17.548ms             1  
-                                           aten::matmul         0.10%      23.660us         1.94%     446.020us      37.168us       0.000us         0.00%      14.659ms       1.222ms            12  
-                                               aten::mm         1.17%     268.524us         1.83%     422.360us      35.197us      14.659ms        83.59%      14.659ms       1.222ms            12  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       8.967ms        51.13%       8.967ms       1.495ms             6  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.685ms        32.42%       5.685ms     947.562us             6  
-                                              aten::add         0.82%     187.722us         1.36%     312.616us      17.368us     785.408us         4.48%     785.408us      43.634us            18  
-                                              aten::mul         0.68%     156.369us         1.15%     264.222us      11.009us     674.688us         3.85%     674.688us      28.112us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     504.575us         2.88%     504.575us      42.048us            12  
-                                       aten::index_add_         0.22%      50.951us         0.37%      86.132us      14.355us     448.545us         2.56%     448.545us      74.757us             6  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     448.545us         2.56%     448.545us      74.757us             6  
-                                            aten::clamp         0.46%     107.053us         0.80%     183.295us      15.275us     336.000us         1.92%     336.000us      28.000us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     336.000us         1.92%     336.000us      28.000us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     314.239us         1.79%     314.239us      52.373us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     280.833us         1.60%     280.833us      46.806us             6  
-                                            aten::index         0.81%     185.806us         1.39%     320.548us      26.712us     259.102us         1.48%     259.102us      21.592us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     258.944us         1.48%     258.944us      21.579us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     225.407us         1.29%     225.407us      37.568us             6  
-                                          aten::sigmoid         0.16%      36.131us         0.27%      61.901us      10.317us     175.073us         1.00%     175.073us      29.179us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     175.073us         1.00%     175.073us      29.179us             6  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      21.083ms       119.21%      21.083ms      21.083ms             1  
+                                        gpt_oss_experts         7.12%       1.665ms        99.98%      23.365ms      23.365ms       0.000us         0.00%      17.695ms      17.695ms             1  
+                                           aten::matmul         0.09%      20.129us         1.89%     441.429us      36.786us       0.000us         0.00%      14.828ms       1.236ms            12  
+                                               aten::mm         1.11%     260.517us         1.80%     421.300us      35.108us      14.828ms        83.84%      14.828ms       1.236ms            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       9.047ms        51.15%       9.047ms       1.508ms             6  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.773ms        32.64%       5.773ms     962.167us             6  
+                                              aten::add         0.74%     174.025us         1.27%     296.156us      16.453us     776.579us         4.39%     776.579us      43.143us            18  
+                                              aten::mul         0.64%     149.555us         1.10%     257.226us      10.718us     654.338us         3.70%     654.338us      27.264us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     499.874us         2.83%     499.874us      41.656us            12  
+                                       aten::index_add_         0.21%      48.400us         0.36%      84.241us      14.040us     449.985us         2.54%     449.985us      74.998us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     449.985us         2.54%     449.985us      74.998us             6  
+                                            aten::clamp         0.46%     107.321us         0.79%     185.253us      15.438us     329.054us         1.86%     329.054us      27.421us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     329.054us         1.86%     329.054us      27.421us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     300.737us         1.70%     300.737us      50.123us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     276.705us         1.56%     276.705us      46.117us             6  
+                                            aten::index         0.76%     178.051us         1.32%     309.462us      25.788us     268.800us         1.52%     268.800us      22.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     253.889us         1.44%     253.889us      21.157us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     236.095us         1.33%     236.095us      39.349us             6  
+                                          aten::sigmoid         0.16%      36.571us         0.27%      63.572us      10.595us     176.833us         1.00%     176.833us      29.472us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     176.833us         1.00%     176.833us      29.472us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 23.030ms
-Self CUDA time total: 17.537ms
+Self CPU time total: 23.371ms
+Self CUDA time total: 17.686ms
 
 
 
@@ -4420,29 +4202,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      24.377ms       140.11%      24.377ms      24.377ms             1  
-                                        gpt_oss_experts        10.50%       2.651ms        99.98%      25.237ms      25.237ms       0.000us         0.00%      17.408ms      17.408ms             1  
-                                           aten::matmul         0.19%      47.519us         3.41%     860.801us      35.867us       0.000us         0.00%      15.185ms     632.705us            24  
-                                               aten::mm         2.06%     521.061us         3.22%     813.282us      33.887us      15.185ms        87.28%      15.185ms     632.705us            24  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       9.179ms        52.76%       9.179ms     764.922us            12  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.124ms        17.96%       3.124ms     520.682us             6  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.871ms        16.50%       2.871ms     478.432us             6  
-                                              aten::add         1.42%     359.495us         2.37%     598.003us      16.611us     427.713us         2.46%     427.713us      11.881us            36  
-                                              aten::mul         1.23%     309.946us         2.09%     527.073us      10.981us     420.510us         2.42%     420.510us       8.761us            48  
-                                       aten::index_add_         0.40%     101.283us         0.66%     166.886us      13.907us     383.489us         2.20%     383.489us      31.957us            12  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     383.489us         2.20%     383.489us      31.957us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     343.712us         1.98%     343.712us      14.321us            24  
-                                            aten::index         1.56%     393.991us         2.62%     662.158us      27.590us     337.086us         1.94%     337.086us      14.045us            24  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     272.926us         1.57%     272.926us      22.744us            12  
-                                            aten::clamp         0.84%     212.993us         1.44%     363.038us      15.127us     230.431us         1.32%     230.431us       9.601us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     230.431us         1.32%     230.431us       9.601us            24  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     223.071us         1.28%     223.071us       9.295us            24  
-                                          aten::nonzero         1.57%     395.401us         5.00%       1.262ms      84.127us     128.836us         0.74%     156.164us      10.411us            15  
-                                            aten::where         0.05%      12.011us         4.77%       1.205ms     100.378us       0.000us         0.00%     140.900us      11.742us            12  
-                                    aten::nonzero_numpy         0.10%      25.021us         4.72%       1.193ms      99.377us       0.000us         0.00%     140.900us      11.742us            12  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      24.709ms       139.35%      24.709ms      24.709ms             1  
+                                        gpt_oss_experts         9.76%       2.650ms        99.98%      27.156ms      27.156ms       0.000us         0.00%      17.741ms      17.741ms             1  
+                                           aten::matmul         0.15%      40.162us         3.17%     860.144us      35.839us       0.000us         0.00%      15.537ms     647.383us            24  
+                                               aten::mm         1.90%     517.331us         3.02%     819.982us      34.166us      15.537ms        87.63%      15.537ms     647.383us            24  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       9.352ms        52.74%       9.352ms     779.317us            12  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.225ms        18.19%       3.225ms     537.452us             6  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.947ms        16.62%       2.947ms     491.169us             6  
+                                              aten::add         1.29%     349.077us         2.22%     601.999us      16.722us     419.552us         2.37%     419.552us      11.654us            36  
+                                              aten::mul         1.15%     311.953us         1.98%     539.014us      11.229us     410.371us         2.31%     410.371us       8.549us            48  
+                                       aten::index_add_         0.36%      97.270us         0.61%     164.412us      13.701us     379.682us         2.14%     379.682us      31.640us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     379.682us         2.14%     379.682us      31.640us            12  
+                                            aten::index         1.31%     354.897us         2.36%     641.129us      26.714us     344.639us         1.94%     344.639us      14.360us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     337.056us         1.90%     337.056us      14.044us            24  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     280.607us         1.58%     280.607us      23.384us            12  
+                                            aten::clamp         0.78%     212.661us         1.36%     368.626us      15.359us     225.662us         1.27%     225.662us       9.403us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     225.662us         1.27%     225.662us       9.403us            24  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     218.112us         1.23%     218.112us       9.088us            24  
+                                          aten::nonzero         1.41%     383.824us         4.68%       1.271ms      84.702us     127.715us         0.72%     153.604us      10.240us            15  
+                                            aten::where         0.04%      11.073us         4.43%       1.203ms     100.252us       0.000us         0.00%     138.052us      11.504us            12  
+                                    aten::nonzero_numpy         0.07%      20.230us         4.39%       1.192ms      99.329us       0.000us         0.00%     138.052us      11.504us            12  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 25.242ms
-Self CUDA time total: 17.398ms
+Self CPU time total: 27.162ms
+Self CUDA time total: 17.731ms
 
 
 
@@ -4452,29 +4234,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      40.556ms       109.47%      40.556ms      40.556ms             1  
-                                        gpt_oss_experts         4.33%       1.794ms        99.85%      41.353ms      41.353ms       0.000us         0.00%      37.080ms      37.080ms             1  
-                                           aten::matmul         0.06%      24.371us         1.08%     445.903us      37.159us       0.000us         0.00%      27.082ms       2.257ms            12  
-                                               aten::mm         0.70%     291.738us         1.02%     421.532us      35.128us      27.082ms        73.10%      27.082ms       2.257ms            12  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      27.079ms        73.09%      27.079ms       2.257ms            12  
-                                              aten::mul         0.38%     159.199us         0.65%     268.178us      11.174us       2.983ms         8.05%       2.983ms     124.287us            24  
-                                              aten::add         0.48%     198.424us         1.09%     451.763us      25.098us       2.404ms         6.49%       2.404ms     133.559us            18  
-                                            aten::clamp         0.27%     112.290us         0.46%     189.433us      15.786us       2.392ms         6.46%       2.392ms     199.373us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.392ms         6.46%       2.392ms     199.373us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.988ms         5.37%       1.988ms     165.669us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.629ms         4.40%       1.629ms     135.763us            12  
-                                       aten::index_add_         0.12%      50.103us         0.20%      84.453us      14.076us     899.456us         2.43%     899.456us     149.909us             6  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     899.456us         2.43%     899.456us     149.909us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     774.912us         2.09%     774.912us     129.152us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     733.217us         1.98%     733.217us     122.203us             6  
-                                            aten::index         0.45%     187.302us         0.77%     318.787us      26.566us     712.767us         1.92%     712.767us      59.397us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     678.496us         1.83%     678.496us     113.083us             6  
-                                          aten::sigmoid         0.09%      36.082us         0.15%      63.023us      10.504us     323.008us         0.87%     323.008us      53.835us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     323.008us         0.87%     323.008us      53.835us             6  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     261.631us         0.71%     261.631us      43.605us             6  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      40.750ms       109.03%      40.750ms      40.750ms             1  
+                                        gpt_oss_experts         4.08%       1.695ms        99.82%      41.512ms      41.512ms       0.000us         0.00%      37.407ms      37.407ms             1  
+                                           aten::matmul         0.05%      20.951us         1.02%     424.118us      35.343us       0.000us         0.00%      27.409ms       2.284ms            12  
+                                               aten::mm         0.67%     277.566us         0.97%     403.167us      33.597us      27.409ms        73.34%      27.409ms       2.284ms            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      27.406ms        73.33%      27.406ms       2.284ms            12  
+                                              aten::mul         0.37%     154.550us         0.63%     261.852us      10.911us       2.976ms         7.96%       2.976ms     124.014us            24  
+                                              aten::add         0.45%     185.160us         1.07%     445.895us      24.772us       2.401ms         6.42%       2.401ms     133.369us            18  
+                                            aten::clamp         0.28%     116.599us         0.48%     198.482us      16.540us       2.391ms         6.40%       2.391ms     199.291us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.391ms         6.40%       2.391ms     199.291us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.983ms         5.30%       1.983ms     165.222us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.625ms         4.35%       1.625ms     135.419us            12  
+                                       aten::index_add_         0.12%      48.080us         0.21%      86.751us      14.459us     910.402us         2.44%     910.402us     151.734us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     910.402us         2.44%     910.402us     151.734us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     775.618us         2.08%     775.618us     129.270us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     740.611us         1.98%     740.611us     123.435us             6  
+                                            aten::index         0.44%     181.234us         0.76%     317.848us      26.487us     714.884us         1.91%     714.884us      59.574us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     681.379us         1.82%     681.379us     113.563us             6  
+                                          aten::sigmoid         0.09%      38.611us         0.16%      65.922us      10.987us     320.927us         0.86%     320.927us      53.488us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     320.927us         0.86%     320.927us      53.488us             6  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     253.057us         0.68%     253.057us      42.176us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 41.415ms
-Self CUDA time total: 37.046ms
+Self CPU time total: 41.585ms
+Self CUDA time total: 37.374ms
 
 
 
@@ -4484,55 +4266,54 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      41.050ms       117.27%      41.050ms      41.050ms             1  
-                                        gpt_oss_experts         6.46%       2.709ms        99.99%      41.912ms      41.912ms       0.000us         0.00%      35.025ms      35.025ms             1  
-                                           aten::matmul         0.11%      47.590us         2.12%     888.873us      37.036us       0.000us         0.00%      29.051ms       1.210ms            24  
-                                               aten::mm         1.28%     536.727us         2.01%     841.283us      35.053us      29.051ms        82.99%      29.051ms       1.210ms            24  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      20.585ms        58.81%      20.585ms       1.372ms            15  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       8.453ms        24.15%       8.453ms     939.204us             9  
-                                              aten::add         0.88%     367.610us         1.45%     609.056us      16.918us       1.486ms         4.24%       1.486ms      41.264us            36  
-                                              aten::mul         0.74%     309.128us         1.24%     518.283us      10.798us       1.380ms         3.94%       1.380ms      28.757us            48  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     925.695us         2.64%     925.695us      38.571us            24  
-                                       aten::index_add_         0.24%      99.111us         0.40%     167.273us      13.939us     903.487us         2.58%     903.487us      75.291us            12  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     903.487us         2.58%     903.487us      75.291us            12  
-                                            aten::clamp         0.51%     214.986us         0.87%     364.790us      15.200us     775.806us         2.22%     775.806us      32.325us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     775.806us         2.22%     775.806us      32.325us            24  
-                                            aten::index         0.89%     373.269us         1.50%     629.207us      26.217us     670.881us         1.92%     670.881us      27.953us            24  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     631.200us         1.80%     631.200us      52.600us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     600.224us         1.71%     600.224us      50.019us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     559.808us         1.60%     559.808us      46.651us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     540.611us         1.54%     540.611us      22.525us            24  
-                                          aten::sigmoid         0.17%      72.182us         0.29%     123.582us      10.298us     351.039us         1.00%     351.039us      29.253us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     351.039us         1.00%     351.039us      29.253us            12  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      41.218ms       116.52%      41.218ms      41.218ms             1  
+                                        gpt_oss_experts         6.00%       2.524ms        99.99%      42.088ms      42.088ms       0.000us         0.00%      35.395ms      35.395ms             1  
+                                           aten::matmul         0.10%      40.160us         2.08%     875.043us      36.460us       0.000us         0.00%      29.436ms       1.226ms            24  
+                                               aten::mm         1.24%     520.099us         1.98%     834.883us      34.787us      29.436ms        83.21%      29.436ms       1.226ms            24  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      20.785ms        58.75%      20.785ms       1.386ms            15  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       8.635ms        24.41%       8.635ms     959.410us             9  
+                                              aten::add         0.83%     349.812us         1.43%     602.505us      16.736us       1.482ms         4.19%       1.482ms      41.161us            36  
+                                              aten::mul         0.72%     302.661us         1.25%     525.878us      10.956us       1.369ms         3.87%       1.369ms      28.527us            48  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     928.163us         2.62%     928.163us      38.673us            24  
+                                       aten::index_add_         0.23%      95.791us         0.40%     170.382us      14.198us     908.198us         2.57%     908.198us      75.683us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     908.198us         2.57%     908.198us      75.683us            12  
+                                            aten::clamp         0.52%     220.263us         0.90%     378.355us      15.765us     771.551us         2.18%     771.551us      32.148us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     771.551us         2.18%     771.551us      32.148us            24  
+                                            aten::index         0.83%     351.191us         1.46%     613.487us      25.562us     665.121us         1.88%     665.121us      27.713us            24  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     648.065us         1.83%     648.065us      54.005us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     594.560us         1.68%     594.560us      49.547us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     553.635us         1.57%     553.635us      46.136us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     519.010us         1.47%     519.010us      21.625us            24  
+                                          aten::sigmoid         0.17%      72.451us         0.30%     125.701us      10.475us     356.257us         1.01%     356.257us      29.688us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     356.257us         1.01%     356.257us      29.688us            12  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 41.917ms
-Self CUDA time total: 35.005ms
+Self CPU time total: 42.094ms
+Self CUDA time total: 35.375ms
 
 
 impl                     wl                  p50(ms)  ok
-gpt_oss_experts          cuda_B1_S1024_E2       3.79  True
-gpt_oss_experts          cuda_B1_S1024_E4       5.24  True
-gpt_oss_experts          cuda_B1_S512_E2        2.63  True
-gpt_oss_experts          cuda_B1_S512_E4        3.89  True
-gpt_oss_experts          cuda_B4_S1024_E2      13.28  True
-gpt_oss_experts          cuda_B4_S1024_E4      13.19  True
-gpt_oss_experts          cuda_B4_S512_E2        6.74  True
-gpt_oss_experts          cuda_B4_S512_E4        7.36  True
+gpt_oss_experts          cuda_B1_S1024_E2       3.84  True
+gpt_oss_experts          cuda_B1_S1024_E4       5.30  True
+gpt_oss_experts          cuda_B1_S512_E2        2.68  True
+gpt_oss_experts          cuda_B1_S512_E4        3.91  True
+gpt_oss_experts          cuda_B4_S1024_E2      13.35  True
+gpt_oss_experts          cuda_B4_S1024_E4      13.35  True
+gpt_oss_experts          cuda_B4_S512_E2        6.80  True
+gpt_oss_experts          cuda_B4_S512_E4        7.46  True
 
▶ UV Install Logs
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s] -Fetching 6 files: 33%|███▎ | 2/6 [00:00<00:00, 16.13it/s] -Fetching 6 files: 67%|██████▋ | 4/6 [00:00<00:00, 7.33it/s] -Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 11.97it/s]
+Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 3.54it/s] +Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 7.08it/s]

Artifacts:

openai_moe.jsonl