diff --git "a/openai_moe/impls/binned_torch.html" "b/openai_moe/impls/binned_torch.html" --- "a/openai_moe/impls/binned_torch.html" +++ "b/openai_moe/impls/binned_torch.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Fri Oct 31 20:00:34 2025 +Mon Nov 10 21:58:43 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 34C P0 81W / 350W | 0MiB / 46068MiB | 18% Default | +| N/A 31C P0 78W / 350W | 0MiB / 46068MiB | 17% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -4155,7 +3937,7 @@ Cell: nv | 0.24s ▼ output ▶ uv-logs | -Cell: benchmark | 727.85s +Cell: benchmark | 727.18s | Raw @@ -4313,29 +4095,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 906.550ms 1808.50% 906.550ms 906.550ms 1 - binned_torch 25.29% 229.728ms 100.00% 908.308ms 908.308ms 0.000us 0.00% 50.129ms 50.129ms 1 - aten::item 1.81% 16.434ms 25.66% 233.033ms 15.186us 0.000us 0.00% 15.809ms 1.030us 15345 - aten::_local_scalar_dense 6.08% 55.189ms 23.85% 216.599ms 14.115us 15.808ms 31.54% 15.809ms 1.030us 15345 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.808ms 31.54% 15.808ms 1.030us 15345 - aten::bmm 0.02% 187.925us 0.02% 226.636us 37.773us 7.688ms 15.34% 7.688ms 1.281ms 6 - ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.688ms 15.34% 7.688ms 1.281ms 6 - aten::floor_divide 5.37% 48.789ms 13.13% 119.247ms 19.409us 7.554ms 15.07% 7.554ms 1.230us 6144 - aten::copy_ 3.71% 33.699ms 9.08% 82.451ms 13.394us 6.606ms 13.18% 6.607ms 1.073us 6156 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.602ms 13.17% 6.602ms 1.073us 6153 - aten::mul 3.08% 27.972ms 5.49% 49.893ms 16.194us 4.718ms 9.41% 4.718ms 1.531us 3081 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.471ms 8.92% 4.471ms 1.456us 3072 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032ms 8.04% 4.032ms 1.312us 3072 - aten::remainder 3.03% 27.567ms 4.66% 42.309ms 13.772us 3.722ms 7.42% 3.722ms 1.212us 3072 - aten::add 2.91% 26.436ms 4.87% 44.207ms 14.575us 3.546ms 7.07% 3.546ms 1.169us 3033 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.524ms 7.03% 3.524ms 1.147us 3072 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.156ms 6.30% 3.156ms 1.042us 3030 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.964ms 3.92% 1.964ms 1.279us 1536 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.758ms 3.51% 1.758ms 1.145us 1536 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 286.305us 0.57% 286.305us 47.718us 6 + binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 931.122ms 1835.78% 931.122ms 931.122ms 1 + binned_torch 25.32% 236.300ms 100.00% 933.185ms 933.185ms 0.000us 0.00% 50.723ms 50.723ms 1 + aten::item 1.92% 17.916ms 25.08% 234.061ms 15.253us 0.000us 0.00% 15.750ms 1.026us 15345 + aten::_local_scalar_dense 5.72% 53.357ms 23.16% 216.145ms 14.086us 15.749ms 31.05% 15.750ms 1.026us 15345 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.749ms 31.05% 15.749ms 1.026us 15345 + aten::floor_divide 5.56% 51.923ms 13.14% 122.652ms 19.963us 7.815ms 15.41% 7.815ms 1.272us 6144 + aten::bmm 0.02% 190.442us 0.02% 231.383us 38.564us 7.780ms 15.34% 7.780ms 1.297ms 6 + ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.780ms 15.34% 7.780ms 1.297ms 6 + aten::copy_ 3.79% 35.401ms 9.18% 85.713ms 13.923us 6.584ms 12.98% 6.585ms 1.070us 6156 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.579ms 12.97% 6.579ms 1.069us 6153 + aten::mul 3.06% 28.578ms 5.54% 51.726ms 16.789us 4.711ms 9.29% 4.711ms 1.529us 3081 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.480ms 8.83% 4.480ms 1.458us 3072 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.161ms 8.20% 4.161ms 1.354us 3072 + aten::remainder 3.12% 29.137ms 4.83% 45.065ms 14.669us 3.840ms 7.57% 3.840ms 1.250us 3072 + aten::add 2.80% 26.083ms 4.76% 44.381ms 14.633us 3.757ms 7.41% 3.757ms 1.239us 3033 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.656ms 7.21% 3.656ms 1.190us 3072 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.366ms 6.64% 3.366ms 1.111us 3030 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.023ms 3.99% 2.023ms 1.317us 1536 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.817ms 3.58% 1.817ms 1.183us 1536 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 283.649us 0.56% 283.649us 47.275us 6 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 908.315ms -Self CUDA time total: 50.127ms +Self CPU time total: 933.193ms +Self CUDA time total: 50.721ms @@ -4345,29 +4127,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 939.657ms 1760.51% 939.657ms 939.657ms 1 - binned_torch 24.72% 232.366ms 100.00% 940.175ms 940.175ms 0.000us 0.00% 53.379ms 53.379ms 1 - aten::item 1.65% 15.471ms 26.56% 249.752ms 14.748us 0.000us 0.00% 17.339ms 1.024us 16935 - aten::_local_scalar_dense 6.16% 57.893ms 24.92% 234.282ms 13.834us 17.337ms 32.48% 17.339ms 1.024us 16935 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.337ms 32.48% 17.337ms 1.024us 16935 - aten::bmm 0.02% 191.684us 0.02% 230.777us 38.463us 7.882ms 14.77% 7.882ms 1.314ms 6 - ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.882ms 14.77% 7.882ms 1.314ms 6 - aten::floor_divide 5.10% 47.974ms 12.37% 116.337ms 18.935us 7.540ms 14.13% 7.541ms 1.227us 6144 - aten::copy_ 3.80% 35.738ms 9.00% 84.586ms 13.740us 6.593ms 12.35% 6.595ms 1.071us 6156 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.590ms 12.35% 6.590ms 1.071us 6153 - aten::add 4.16% 39.066ms 7.01% 65.874ms 14.342us 5.113ms 9.58% 5.113ms 1.113us 4593 - aten::mul 2.92% 27.472ms 5.20% 48.883ms 15.866us 4.715ms 8.83% 4.715ms 1.530us 3081 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.472ms 8.38% 4.472ms 1.456us 3072 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.021ms 7.53% 4.021ms 1.309us 3072 - aten::remainder 2.73% 25.664ms 4.27% 40.147ms 13.069us 3.707ms 6.95% 3.707ms 1.207us 3072 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.519ms 6.59% 3.519ms 1.146us 3072 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.178ms 5.95% 3.178ms 1.049us 3030 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.958ms 3.67% 1.958ms 1.275us 1536 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.749ms 3.28% 1.749ms 1.139us 1536 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.537ms 2.88% 1.537ms 0.985us 1560 + binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 938.961ms 1720.32% 938.961ms 938.961ms 1 + binned_torch 25.07% 235.565ms 100.00% 939.473ms 939.473ms 0.000us 0.00% 54.589ms 54.589ms 1 + aten::item 1.76% 16.540ms 26.46% 248.589ms 14.679us 0.000us 0.00% 17.855ms 1.054us 16935 + aten::_local_scalar_dense 5.69% 53.475ms 24.70% 232.048ms 13.702us 17.853ms 32.71% 17.855ms 1.054us 16935 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.853ms 32.71% 17.853ms 1.054us 16935 + aten::bmm 0.02% 182.580us 0.02% 223.522us 37.254us 7.981ms 14.62% 7.981ms 1.330ms 6 + ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.981ms 14.62% 7.981ms 1.330ms 6 + aten::floor_divide 5.18% 48.644ms 12.51% 117.515ms 19.127us 7.813ms 14.31% 7.816ms 1.272us 6144 + aten::copy_ 3.69% 34.686ms 8.73% 82.032ms 13.325us 6.629ms 12.15% 6.630ms 1.077us 6156 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.626ms 12.14% 6.626ms 1.077us 6153 + aten::add 3.97% 37.266ms 6.91% 64.908ms 14.132us 5.261ms 9.64% 5.261ms 1.145us 4593 + aten::mul 2.87% 26.992ms 5.23% 49.129ms 15.946us 4.699ms 8.61% 4.699ms 1.525us 3081 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.475ms 8.20% 4.475ms 1.457us 3072 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.158ms 7.62% 4.158ms 1.353us 3072 + aten::remainder 2.85% 26.773ms 4.50% 42.318ms 13.775us 3.852ms 7.06% 3.852ms 1.254us 3072 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 6.70% 3.655ms 1.190us 3072 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.271ms 5.99% 3.271ms 1.080us 3030 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.030ms 3.72% 2.030ms 1.322us 1536 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.822ms 3.34% 1.822ms 1.186us 1536 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.585ms 2.90% 1.585ms 1.016us 1560 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 940.182ms -Self CUDA time total: 53.374ms +Self CPU time total: 939.480ms +Self CUDA time total: 54.581ms @@ -4377,29 +4159,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.751s 1703.41% 1.751s 1.751s 1 - binned_torch 24.63% 431.727ms 100.00% 1.753s 1.753s 0.000us 0.00% 102.829ms 102.829ms 1 - aten::item 1.69% 29.621ms 25.96% 455.095ms 14.915us 0.000us 0.00% 31.387ms 1.029us 30513 - aten::_local_scalar_dense 5.96% 104.552ms 24.27% 425.474ms 13.944us 31.383ms 30.52% 31.387ms 1.029us 30513 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.383ms 30.52% 31.383ms 1.029us 30513 - aten::bmm 0.01% 224.614us 0.02% 267.595us 44.599us 15.143ms 14.73% 15.143ms 2.524ms 6 - ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.143ms 14.73% 15.143ms 2.524ms 6 - aten::floor_divide 5.56% 97.549ms 13.34% 233.779ms 19.025us 15.089ms 14.68% 15.090ms 1.228us 12288 - aten::copy_ 4.01% 70.283ms 9.47% 166.011ms 13.497us 13.317ms 12.95% 13.317ms 1.083us 12300 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.313ms 12.95% 13.313ms 1.083us 12294 - aten::mul 3.14% 55.060ms 5.66% 99.236ms 16.128us 11.295ms 10.99% 11.297ms 1.836us 6153 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.940ms 9.67% 9.940ms 1.618us 6144 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.059ms 7.84% 8.059ms 1.312us 6144 - aten::add 2.85% 49.952ms 4.90% 85.866ms 14.522us 7.505ms 7.30% 7.506ms 1.269us 5913 - aten::remainder 3.02% 53.015ms 4.74% 83.117ms 13.528us 7.414ms 7.21% 7.416ms 1.207us 6144 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.031ms 6.84% 7.031ms 1.144us 6144 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.224ms 6.05% 6.224ms 1.053us 5910 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.914ms 3.81% 3.914ms 1.274us 3072 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 3.40% 3.500ms 1.139us 3072 - aten::clamp 0.00% 71.603us 0.01% 117.833us 19.639us 1.180ms 1.15% 1.180ms 196.722us 6 + binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.710s 1645.94% 1.710s 1.710s 1 + binned_torch 23.47% 401.594ms 100.00% 1.711s 1.711s 0.000us 0.00% 103.932ms 103.932ms 1 + aten::item 1.77% 30.361ms 27.00% 461.971ms 15.140us 0.000us 0.00% 31.541ms 1.034us 30513 + aten::_local_scalar_dense 5.97% 102.153ms 25.22% 431.610ms 14.145us 31.538ms 30.35% 31.541ms 1.034us 30513 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.538ms 30.35% 31.538ms 1.034us 30513 + aten::floor_divide 5.77% 98.697ms 13.68% 234.018ms 19.044us 15.598ms 15.01% 15.600ms 1.270us 12288 + aten::bmm 0.01% 219.084us 0.02% 260.723us 43.454us 15.235ms 14.66% 15.235ms 2.539ms 6 + ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.235ms 14.66% 15.235ms 2.539ms 6 + aten::copy_ 3.97% 67.926ms 9.38% 160.451ms 13.045us 13.315ms 12.81% 13.316ms 1.083us 12300 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.311ms 12.81% 13.311ms 1.083us 12294 + aten::mul 3.19% 54.637ms 5.82% 99.678ms 16.200us 11.250ms 10.83% 11.252ms 1.829us 6153 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.903ms 9.53% 9.903ms 1.612us 6144 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.304ms 7.99% 8.304ms 1.352us 6144 + aten::remainder 3.07% 52.461ms 4.79% 82.008ms 13.348us 7.670ms 7.38% 7.671ms 1.249us 6144 + aten::add 2.76% 47.163ms 4.86% 83.106ms 14.055us 7.632ms 7.34% 7.633ms 1.291us 5913 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.294ms 7.02% 7.294ms 1.187us 6144 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.354ms 6.11% 6.354ms 1.075us 5910 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.041ms 3.89% 4.041ms 1.316us 3072 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.629ms 3.49% 3.629ms 1.181us 3072 + aten::clamp 0.00% 71.350us 0.01% 113.931us 18.988us 1.190ms 1.15% 1.190ms 198.366us 6 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.753s -Self CUDA time total: 102.819ms +Self CPU time total: 1.711s +Self CUDA time total: 103.922ms @@ -4409,29 +4191,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.834s 1680.90% 1.834s 1.834s 1 - binned_torch 24.76% 454.393ms 100.00% 1.835s 1.835s 0.000us 0.00% 109.119ms 109.119ms 1 - aten::item 1.65% 30.229ms 26.42% 484.819ms 14.374us 0.000us 0.00% 34.734ms 1.030us 33729 - aten::_local_scalar_dense 6.08% 111.551ms 24.77% 454.590ms 13.478us 34.731ms 31.83% 34.734ms 1.030us 33729 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 34.731ms 31.83% 34.731ms 1.030us 33729 - aten::bmm 0.01% 219.836us 0.01% 260.868us 43.478us 15.243ms 13.97% 15.243ms 2.540ms 6 - ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.243ms 13.97% 15.243ms 2.540ms 6 - aten::floor_divide 5.37% 98.619ms 12.62% 231.581ms 18.846us 15.065ms 13.81% 15.065ms 1.226us 12288 - aten::copy_ 3.65% 66.986ms 8.64% 158.623ms 12.896us 13.313ms 12.20% 13.316ms 1.083us 12300 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.309ms 12.20% 13.309ms 1.082us 12297 - aten::mul 2.96% 54.365ms 5.27% 96.616ms 15.702us 10.967ms 10.05% 10.969ms 1.783us 6153 - aten::add 4.05% 74.247ms 6.97% 127.934ms 14.060us 10.631ms 9.74% 10.631ms 1.168us 9099 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.613ms 8.81% 9.613ms 1.565us 6144 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.047ms 7.37% 8.047ms 1.310us 6144 - aten::remainder 2.81% 51.641ms 4.37% 80.193ms 13.052us 7.438ms 6.82% 7.438ms 1.211us 6144 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.018ms 6.43% 7.018ms 1.142us 6144 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.225ms 5.71% 6.225ms 1.053us 5910 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.928ms 3.60% 3.928ms 1.279us 3072 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.510ms 3.22% 3.510ms 1.143us 3072 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.154ms 2.89% 3.154ms 0.990us 3186 + binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.831s 1659.19% 1.831s 1.831s 1 + binned_torch 23.77% 435.469ms 100.00% 1.832s 1.832s 0.000us 0.00% 110.361ms 110.361ms 1 + aten::item 1.74% 31.875ms 27.52% 504.183ms 14.948us 0.000us 0.00% 34.964ms 1.037us 33729 + aten::_local_scalar_dense 6.20% 113.521ms 25.78% 472.309ms 14.003us 34.961ms 31.68% 34.964ms 1.037us 33729 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 34.961ms 31.68% 34.961ms 1.037us 33729 + aten::floor_divide 5.21% 95.369ms 12.55% 229.877ms 18.707us 15.595ms 14.13% 15.597ms 1.269us 12288 + aten::bmm 0.01% 225.035us 0.01% 267.825us 44.638us 15.231ms 13.80% 15.231ms 2.539ms 6 + ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.231ms 13.80% 15.231ms 2.539ms 6 + aten::copy_ 3.69% 67.648ms 8.80% 161.241ms 13.109us 13.343ms 12.09% 13.347ms 1.085us 12300 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.340ms 12.09% 13.340ms 1.085us 12297 + aten::mul 2.99% 54.761ms 5.39% 98.799ms 16.057us 10.934ms 9.91% 10.936ms 1.777us 6153 + aten::add 3.91% 71.612ms 6.90% 126.397ms 13.891us 10.863ms 9.84% 10.863ms 1.194us 9099 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.586ms 8.69% 9.586ms 1.560us 6144 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.308ms 7.53% 8.308ms 1.352us 6144 + aten::remainder 2.81% 51.395ms 4.41% 80.796ms 13.150us 7.688ms 6.97% 7.688ms 1.251us 6144 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.287ms 6.60% 7.287ms 1.186us 6144 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.364ms 5.77% 6.364ms 1.077us 5910 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.054ms 3.67% 4.054ms 1.320us 3072 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.634ms 3.29% 3.634ms 1.183us 3072 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.232ms 2.93% 3.232ms 1.014us 3186 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.835s -Self CUDA time total: 109.111ms +Self CPU time total: 1.832s +Self CUDA time total: 110.351ms @@ -4441,29 +4223,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.518s 1672.53% 3.518s 3.518s 1 - binned_torch 24.37% 858.118ms 100.00% 3.521s 3.521s 0.000us 0.00% 210.357ms 210.357ms 1 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.177ms 30.04% 63.177ms 1.026us 61586 - aten::item 1.69% 59.432ms 26.02% 916.275ms 14.878us 0.000us 0.00% 63.177ms 1.026us 61587 - aten::_local_scalar_dense 5.96% 209.806ms 24.34% 856.843ms 13.913us 63.176ms 30.03% 63.177ms 1.026us 61587 - aten::floor_divide 5.42% 190.698ms 13.50% 475.217ms 19.337us 30.482ms 14.49% 30.486ms 1.240us 24576 - aten::bmm 0.01% 235.397us 0.01% 281.998us 47.000us 29.291ms 13.93% 29.291ms 4.882ms 6 - ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.291ms 13.93% 29.291ms 4.882ms 6 - aten::copy_ 3.77% 132.744ms 9.15% 322.282ms 13.107us 26.808ms 12.75% 26.810ms 1.090us 24588 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.805ms 12.74% 26.805ms 1.090us 24582 - aten::mul 3.15% 110.895ms 5.78% 203.457ms 16.545us 25.566ms 12.15% 25.568ms 2.079us 12297 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.101ms 10.51% 22.101ms 1.799us 12288 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.470ms 7.83% 16.470ms 1.340us 12288 - aten::add 2.99% 105.439ms 5.15% 181.211ms 14.601us 16.115ms 7.66% 16.116ms 1.298us 12411 - aten::remainder 2.99% 105.111ms 4.72% 166.195ms 13.525us 14.836ms 7.05% 14.838ms 1.208us 12288 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.014ms 6.66% 14.014ms 1.140us 12288 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 12.996ms 6.18% 12.996ms 1.047us 12408 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 7.830ms 3.72% 7.830ms 1.274us 6144 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.006ms 3.33% 7.006ms 1.140us 6144 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.626ms 1.25% 2.626ms 437.595us 6 + binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.493s 1641.52% 3.493s 3.493s 1 + binned_torch 23.72% 828.141ms 100.00% 3.492s 3.492s 0.000us 0.00% 212.777ms 212.777ms 1 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.619ms 29.90% 63.619ms 1.033us 61586 + aten::item 1.76% 61.470ms 26.76% 934.319ms 15.171us 0.000us 0.00% 63.619ms 1.033us 61587 + aten::_local_scalar_dense 5.95% 207.894ms 25.00% 872.849ms 14.173us 63.616ms 29.90% 63.619ms 1.033us 61587 + aten::floor_divide 5.53% 193.077ms 13.34% 465.879ms 18.957us 31.606ms 14.86% 31.612ms 1.286us 24576 + aten::bmm 0.01% 236.694us 0.01% 284.594us 47.432us 29.067ms 13.66% 29.067ms 4.844ms 6 + ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.067ms 13.66% 29.067ms 4.844ms 6 + aten::copy_ 3.89% 135.756ms 9.33% 325.881ms 13.254us 26.713ms 12.56% 26.714ms 1.086us 24588 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.711ms 12.55% 26.711ms 1.087us 24582 + aten::mul 3.15% 110.066ms 5.73% 199.944ms 16.260us 25.593ms 12.03% 25.595ms 2.081us 12297 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.131ms 10.40% 22.131ms 1.801us 12288 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 17.009ms 7.99% 17.009ms 1.384us 12288 + aten::add 2.82% 98.495ms 4.98% 173.932ms 14.014us 16.658ms 7.83% 16.659ms 1.342us 12411 + aten::remainder 3.04% 106.037ms 4.77% 166.563ms 13.555us 15.433ms 7.25% 15.435ms 1.256us 12288 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.597ms 6.86% 14.597ms 1.188us 12288 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.527ms 6.36% 13.527ms 1.090us 12408 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.132ms 3.82% 8.132ms 1.324us 6144 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.300ms 3.43% 7.300ms 1.188us 6144 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.623ms 1.23% 2.623ms 437.201us 6 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.521s -Self CUDA time total: 210.342ms +Self CPU time total: 3.492s +Self CUDA time total: 212.763ms @@ -4473,29 +4255,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.742s 1679.57% 3.742s 3.742s 1 - binned_torch 24.42% 914.204ms 100.00% 3.744s 3.744s 0.000us 0.00% 222.834ms 222.834ms 1 - aten::item 1.73% 64.729ms 26.53% 993.125ms 14.638us 0.000us 0.00% 69.848ms 1.030us 67845 - aten::_local_scalar_dense 6.14% 229.850ms 24.80% 928.396ms 13.684us 69.844ms 31.35% 69.848ms 1.030us 67845 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.844ms 31.35% 69.844ms 1.030us 67841 - aten::floor_divide 5.29% 197.931ms 12.52% 468.921ms 19.080us 30.509ms 13.69% 30.515ms 1.242us 24576 - ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.140ms 13.08% 29.140ms 4.857ms 6 - aten::bmm 0.01% 232.675us 0.01% 273.538us 45.590us 29.140ms 13.08% 29.140ms 4.857ms 6 - aten::copy_ 3.66% 136.881ms 8.73% 326.908ms 13.295us 26.646ms 11.96% 26.647ms 1.084us 24588 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.643ms 11.96% 26.643ms 1.084us 24581 - aten::mul 2.96% 110.832ms 5.24% 196.253ms 15.959us 25.520ms 11.45% 25.522ms 2.075us 12297 - aten::add 4.16% 155.619ms 7.13% 266.948ms 14.322us 22.169ms 9.95% 22.169ms 1.189us 18639 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.076ms 9.91% 22.076ms 1.797us 12288 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.462ms 7.39% 16.462ms 1.340us 12287 - aten::remainder 2.77% 103.887ms 4.33% 162.240ms 13.203us 14.877ms 6.68% 14.879ms 1.211us 12288 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.047ms 6.30% 14.047ms 1.143us 12287 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 12.957ms 5.82% 12.957ms 1.044us 12407 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 7.856ms 3.53% 7.856ms 1.279us 6144 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.021ms 3.15% 7.021ms 1.143us 6144 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.109ms 2.74% 6.109ms 0.981us 6228 + binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.669s 1629.04% 3.669s 3.669s 1 + binned_torch 23.71% 870.025ms 100.00% 3.670s 3.670s 0.000us 0.00% 225.217ms 225.217ms 1 + aten::item 1.74% 63.801ms 26.98% 990.130ms 14.594us 0.000us 0.00% 69.736ms 1.028us 67845 + aten::_local_scalar_dense 5.93% 217.737ms 25.24% 926.329ms 13.654us 69.731ms 30.96% 69.736ms 1.028us 67845 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.731ms 30.96% 69.731ms 1.028us 67841 + aten::floor_divide 5.15% 189.112ms 12.36% 453.770ms 18.464us 31.523ms 14.00% 31.529ms 1.283us 24576 + aten::bmm 0.01% 229.594us 0.01% 272.075us 45.346us 28.926ms 12.84% 28.926ms 4.821ms 6 + ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 28.926ms 12.84% 28.926ms 4.821ms 6 + aten::copy_ 3.90% 143.149ms 8.93% 327.628ms 13.325us 26.721ms 11.87% 26.722ms 1.087us 24588 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.719ms 11.86% 26.719ms 1.087us 24581 + aten::mul 3.13% 114.822ms 5.47% 200.852ms 16.333us 25.594ms 11.37% 25.596ms 2.081us 12297 + aten::add 3.87% 141.881ms 6.78% 248.742ms 13.345us 23.243ms 10.32% 23.243ms 1.247us 18639 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.132ms 9.83% 22.132ms 1.801us 12288 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.988ms 7.54% 16.988ms 1.383us 12287 + aten::remainder 2.85% 104.729ms 4.42% 162.304ms 13.208us 15.354ms 6.82% 15.355ms 1.250us 12288 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.535ms 6.45% 14.535ms 1.183us 12287 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.676ms 6.07% 13.676ms 1.102us 12407 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.096ms 3.60% 8.096ms 1.318us 6144 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.258ms 3.22% 7.258ms 1.181us 6144 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.475ms 2.88% 6.475ms 1.040us 6228 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.744s -Self CUDA time total: 222.814ms +Self CPU time total: 3.670s +Self CUDA time total: 225.199ms @@ -4505,29 +4287,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 6.967s 1665.27% 6.967s 6.967s 1 - binned_torch 24.68% 1.721s 100.00% 6.973s 6.973s 0.000us 0.00% 418.392ms 418.392ms 1 - aten::item 1.64% 114.231ms 25.94% 1.809s 14.732us 0.000us 0.00% 125.163ms 1.020us 122763 - aten::_local_scalar_dense 5.97% 416.624ms 24.30% 1.694s 13.802us 125.151ms 29.91% 125.163ms 1.020us 122763 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 125.151ms 29.91% 125.151ms 1.019us 122762 - aten::floor_divide 5.62% 391.846ms 13.33% 929.253ms 18.906us 61.051ms 14.59% 61.053ms 1.242us 49152 - ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.281ms 13.69% 57.281ms 9.547ms 6 - aten::bmm 0.00% 234.996us 0.00% 276.787us 46.131us 57.281ms 13.69% 57.281ms 9.547ms 6 - aten::copy_ 3.92% 273.517ms 9.35% 652.240ms 13.268us 53.435ms 12.77% 53.437ms 1.087us 49158 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.433ms 12.77% 53.433ms 1.087us 49154 - aten::mul 3.15% 219.950ms 5.62% 391.612ms 15.929us 51.411ms 12.29% 51.419ms 2.091us 24585 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.451ms 10.62% 44.451ms 1.809us 24576 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 32.993ms 7.89% 32.993ms 1.343us 24576 - aten::add 2.87% 200.428ms 4.94% 344.166ms 14.085us 31.887ms 7.62% 31.889ms 1.305us 24435 - aten::remainder 3.00% 208.953ms 4.67% 325.902ms 13.261us 29.680ms 7.09% 29.684ms 1.208us 24576 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 28.059ms 6.71% 28.059ms 1.142us 24576 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.247ms 6.03% 25.247ms 1.033us 24431 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.667ms 3.74% 15.667ms 1.275us 12288 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.014ms 3.35% 14.014ms 1.140us 12288 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.233ms 1.25% 5.233ms 872.184us 6 + binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 6.859s 1611.59% 6.859s 6.859s 1 + binned_torch 24.10% 1.655s 100.00% 6.866s 6.866s 0.000us 0.00% 425.661ms 425.661ms 1 + aten::item 1.68% 115.068ms 26.29% 1.805s 14.704us 0.000us 0.00% 127.116ms 1.035us 122763 + aten::_local_scalar_dense 5.74% 393.879ms 24.61% 1.690s 13.764us 127.109ms 29.86% 127.116ms 1.035us 122763 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 127.110ms 29.86% 127.110ms 1.035us 122762 + aten::floor_divide 5.46% 374.656ms 13.09% 898.826ms 18.287us 63.404ms 14.90% 63.408ms 1.290us 49152 + aten::bmm 0.00% 234.973us 0.00% 276.793us 46.132us 56.971ms 13.38% 56.971ms 9.495ms 6 + ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.971ms 13.38% 56.971ms 9.495ms 6 + aten::copy_ 4.17% 286.167ms 9.49% 651.750ms 13.258us 53.615ms 12.60% 53.616ms 1.091us 49158 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.612ms 12.60% 53.612ms 1.091us 49154 + aten::mul 3.34% 229.543ms 5.86% 402.465ms 16.370us 51.556ms 12.11% 51.561ms 2.097us 24585 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.609ms 10.48% 44.609ms 1.815us 24576 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.184ms 8.03% 34.184ms 1.391us 24576 + aten::add 2.69% 184.813ms 4.71% 323.308ms 13.231us 33.584ms 7.89% 33.588ms 1.375us 24435 + aten::remainder 3.06% 210.055ms 4.75% 326.044ms 13.267us 30.927ms 7.27% 30.931ms 1.259us 24576 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.221ms 6.87% 29.221ms 1.189us 24576 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.946ms 6.33% 26.946ms 1.103us 24431 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.291ms 3.83% 16.291ms 1.326us 12288 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.637ms 3.44% 14.637ms 1.191us 12288 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.222ms 1.23% 5.222ms 870.407us 6 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.973s -Self CUDA time total: 418.361ms +Self CPU time total: 6.866s +Self CUDA time total: 425.634ms @@ -4537,40 +4319,40 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.368s 1660.72% 7.368s 7.368s 1 - binned_torch 24.39% 1.797s 100.00% 7.370s 7.370s 0.000us 0.00% 443.698ms 443.698ms 1 - aten::item 1.69% 124.742ms 26.51% 1.954s 14.504us 0.000us 0.00% 137.717ms 1.022us 134715 - aten::_local_scalar_dense 6.11% 450.407ms 24.82% 1.829s 13.577us 137.708ms 31.04% 137.717ms 1.022us 134715 - Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 137.710ms 31.04% 137.710ms 1.022us 134711 - aten::floor_divide 5.42% 399.563ms 12.65% 932.414ms 18.970us 61.071ms 13.77% 61.077ms 1.243us 49152 - aten::bmm 0.00% 230.664us 0.00% 272.466us 45.411us 57.304ms 12.92% 57.304ms 9.551ms 6 - ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.304ms 12.92% 57.304ms 9.551ms 6 - aten::copy_ 3.65% 269.132ms 8.67% 639.259ms 13.004us 54.065ms 12.19% 54.067ms 1.100us 49158 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.062ms 12.19% 54.062ms 1.100us 49153 - aten::mul 2.96% 217.959ms 5.26% 387.551ms 15.764us 51.653ms 11.64% 51.660ms 2.101us 24585 -void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.653ms 10.06% 44.653ms 1.817us 24576 - aten::add 4.03% 296.962ms 6.96% 512.647ms 14.100us 43.690ms 9.85% 43.694ms 1.202us 36357 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 32.954ms 7.43% 32.954ms 1.341us 24575 - aten::remainder 2.83% 208.527ms 4.40% 323.906ms 13.180us 29.662ms 6.69% 29.664ms 1.207us 24576 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 28.119ms 6.34% 28.119ms 1.144us 24576 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.409ms 5.73% 25.409ms 1.040us 24431 -void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.666ms 3.53% 15.666ms 1.275us 12288 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 13.995ms 3.15% 13.995ms 1.139us 12288 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.644ms 2.62% 11.644ms 0.977us 11922 + binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.331s 1630.84% 7.331s 7.331s 1 + binned_torch 23.92% 1.754s 100.00% 7.333s 7.333s 0.000us 0.00% 449.578ms 449.578ms 1 + aten::item 1.73% 127.153ms 27.44% 2.013s 14.940us 0.000us 0.00% 139.264ms 1.034us 134715 + aten::_local_scalar_dense 6.23% 456.926ms 25.71% 1.885s 13.996us 139.253ms 30.98% 139.264ms 1.034us 134715 + Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 139.255ms 30.98% 139.255ms 1.034us 134707 + aten::floor_divide 5.02% 368.091ms 12.28% 900.843ms 18.328us 63.383ms 14.10% 63.388ms 1.290us 49152 + ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.831ms 12.64% 56.831ms 9.472ms 6 + aten::bmm 0.00% 231.002us 0.00% 273.424us 45.571us 56.831ms 12.64% 56.831ms 9.472ms 6 + aten::copy_ 3.67% 268.957ms 8.71% 638.523ms 12.989us 53.771ms 11.96% 53.773ms 1.094us 49158 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.768ms 11.96% 53.768ms 1.094us 49149 + aten::mul 2.96% 217.228ms 5.34% 391.576ms 15.927us 51.518ms 11.46% 51.524ms 2.096us 24585 + aten::add 3.83% 280.607ms 6.79% 497.692ms 13.689us 45.514ms 10.12% 45.518ms 1.252us 36357 +void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.542ms 9.91% 44.542ms 1.812us 24576 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.127ms 7.59% 34.127ms 1.389us 24573 + aten::remainder 2.85% 209.203ms 4.50% 330.314ms 13.441us 30.793ms 6.85% 30.795ms 1.253us 24576 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.257ms 6.51% 29.257ms 1.191us 24573 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.610ms 5.92% 26.610ms 1.089us 24431 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.233ms 3.61% 16.233ms 1.321us 12288 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.559ms 3.24% 14.559ms 1.185us 12288 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.261ms 2.73% 12.261ms 1.028us 11922 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.370s -Self CUDA time total: 443.660ms +Self CPU time total: 7.333s +Self CUDA time total: 449.542ms impl wl p50(ms) ok -binned_torch cuda_B1_S1024_E2 372.79 True -binned_torch cuda_B1_S1024_E4 382.68 True -binned_torch cuda_B1_S512_E2 150.05 True -binned_torch cuda_B1_S512_E4 200.26 True -binned_torch cuda_B4_S1024_E2 1486.48 True -binned_torch cuda_B4_S1024_E4 1524.50 True -binned_torch cuda_B4_S512_E2 742.02 True -binned_torch cuda_B4_S512_E4 801.90 True +binned_torch cuda_B1_S1024_E2 367.62 True +binned_torch cuda_B1_S1024_E4 394.19 True +binned_torch cuda_B1_S512_E2 154.67 True +binned_torch cuda_B1_S512_E4 201.50 True +binned_torch cuda_B4_S1024_E2 1483.54 True +binned_torch cuda_B4_S1024_E4 1601.90 True +binned_torch cuda_B4_S512_E2 736.26 True +binned_torch cuda_B4_S512_E4 798.88 TrueArtifacts: