diff --git "a/openai_moe/impls/binned_torch.html" "b/openai_moe/impls/binned_torch.html" --- "a/openai_moe/impls/binned_torch.html" +++ "b/openai_moe/impls/binned_torch.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -4106,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.24s +Cell: nv | 0.22s | Raw @@ -4122,16 +3904,16 @@ Cell: nv | 0.24s
-
Fri Oct 31 20:00:34 2025       
+
Mon Nov 10 21:58:43 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
-|-----------------------------------------+------------------------+----------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   34C    P0             81W /  350W |       0MiB /  46068MiB |     18%      Default |
+| N/A   31C    P0             78W /  350W |       0MiB /  46068MiB |     17%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,7 +3937,7 @@ Cell: nv | 0.24s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 727.85s
+Cell: benchmark | 727.18s
  | 
 
 Raw
@@ -4313,29 +4095,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     906.550ms      1808.50%     906.550ms     906.550ms             1  
-                                           binned_torch        25.29%     229.728ms       100.00%     908.308ms     908.308ms       0.000us         0.00%      50.129ms      50.129ms             1  
-                                             aten::item         1.81%      16.434ms        25.66%     233.033ms      15.186us       0.000us         0.00%      15.809ms       1.030us         15345  
-                              aten::_local_scalar_dense         6.08%      55.189ms        23.85%     216.599ms      14.115us      15.808ms        31.54%      15.809ms       1.030us         15345  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      15.808ms        31.54%      15.808ms       1.030us         15345  
-                                              aten::bmm         0.02%     187.925us         0.02%     226.636us      37.773us       7.688ms        15.34%       7.688ms       1.281ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.688ms        15.34%       7.688ms       1.281ms             6  
-                                     aten::floor_divide         5.37%      48.789ms        13.13%     119.247ms      19.409us       7.554ms        15.07%       7.554ms       1.230us          6144  
-                                            aten::copy_         3.71%      33.699ms         9.08%      82.451ms      13.394us       6.606ms        13.18%       6.607ms       1.073us          6156  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.602ms        13.17%       6.602ms       1.073us          6153  
-                                              aten::mul         3.08%      27.972ms         5.49%      49.893ms      16.194us       4.718ms         9.41%       4.718ms       1.531us          3081  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.471ms         8.92%       4.471ms       1.456us          3072  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032ms         8.04%       4.032ms       1.312us          3072  
-                                        aten::remainder         3.03%      27.567ms         4.66%      42.309ms      13.772us       3.722ms         7.42%       3.722ms       1.212us          3072  
-                                              aten::add         2.91%      26.436ms         4.87%      44.207ms      14.575us       3.546ms         7.07%       3.546ms       1.169us          3033  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.524ms         7.03%       3.524ms       1.147us          3072  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.156ms         6.30%       3.156ms       1.042us          3030  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.964ms         3.92%       1.964ms       1.279us          1536  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.758ms         3.51%       1.758ms       1.145us          1536  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     286.305us         0.57%     286.305us      47.718us             6  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     931.122ms      1835.78%     931.122ms     931.122ms             1  
+                                           binned_torch        25.32%     236.300ms       100.00%     933.185ms     933.185ms       0.000us         0.00%      50.723ms      50.723ms             1  
+                                             aten::item         1.92%      17.916ms        25.08%     234.061ms      15.253us       0.000us         0.00%      15.750ms       1.026us         15345  
+                              aten::_local_scalar_dense         5.72%      53.357ms        23.16%     216.145ms      14.086us      15.749ms        31.05%      15.750ms       1.026us         15345  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      15.749ms        31.05%      15.749ms       1.026us         15345  
+                                     aten::floor_divide         5.56%      51.923ms        13.14%     122.652ms      19.963us       7.815ms        15.41%       7.815ms       1.272us          6144  
+                                              aten::bmm         0.02%     190.442us         0.02%     231.383us      38.564us       7.780ms        15.34%       7.780ms       1.297ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.780ms        15.34%       7.780ms       1.297ms             6  
+                                            aten::copy_         3.79%      35.401ms         9.18%      85.713ms      13.923us       6.584ms        12.98%       6.585ms       1.070us          6156  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.579ms        12.97%       6.579ms       1.069us          6153  
+                                              aten::mul         3.06%      28.578ms         5.54%      51.726ms      16.789us       4.711ms         9.29%       4.711ms       1.529us          3081  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.480ms         8.83%       4.480ms       1.458us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.161ms         8.20%       4.161ms       1.354us          3072  
+                                        aten::remainder         3.12%      29.137ms         4.83%      45.065ms      14.669us       3.840ms         7.57%       3.840ms       1.250us          3072  
+                                              aten::add         2.80%      26.083ms         4.76%      44.381ms      14.633us       3.757ms         7.41%       3.757ms       1.239us          3033  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.656ms         7.21%       3.656ms       1.190us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.366ms         6.64%       3.366ms       1.111us          3030  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.023ms         3.99%       2.023ms       1.317us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.817ms         3.58%       1.817ms       1.183us          1536  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     283.649us         0.56%     283.649us      47.275us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 908.315ms
-Self CUDA time total: 50.127ms
+Self CPU time total: 933.193ms
+Self CUDA time total: 50.721ms
 
 
 
@@ -4345,29 +4127,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     939.657ms      1760.51%     939.657ms     939.657ms             1  
-                                           binned_torch        24.72%     232.366ms       100.00%     940.175ms     940.175ms       0.000us         0.00%      53.379ms      53.379ms             1  
-                                             aten::item         1.65%      15.471ms        26.56%     249.752ms      14.748us       0.000us         0.00%      17.339ms       1.024us         16935  
-                              aten::_local_scalar_dense         6.16%      57.893ms        24.92%     234.282ms      13.834us      17.337ms        32.48%      17.339ms       1.024us         16935  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      17.337ms        32.48%      17.337ms       1.024us         16935  
-                                              aten::bmm         0.02%     191.684us         0.02%     230.777us      38.463us       7.882ms        14.77%       7.882ms       1.314ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.882ms        14.77%       7.882ms       1.314ms             6  
-                                     aten::floor_divide         5.10%      47.974ms        12.37%     116.337ms      18.935us       7.540ms        14.13%       7.541ms       1.227us          6144  
-                                            aten::copy_         3.80%      35.738ms         9.00%      84.586ms      13.740us       6.593ms        12.35%       6.595ms       1.071us          6156  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.590ms        12.35%       6.590ms       1.071us          6153  
-                                              aten::add         4.16%      39.066ms         7.01%      65.874ms      14.342us       5.113ms         9.58%       5.113ms       1.113us          4593  
-                                              aten::mul         2.92%      27.472ms         5.20%      48.883ms      15.866us       4.715ms         8.83%       4.715ms       1.530us          3081  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.472ms         8.38%       4.472ms       1.456us          3072  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.021ms         7.53%       4.021ms       1.309us          3072  
-                                        aten::remainder         2.73%      25.664ms         4.27%      40.147ms      13.069us       3.707ms         6.95%       3.707ms       1.207us          3072  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.519ms         6.59%       3.519ms       1.146us          3072  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.178ms         5.95%       3.178ms       1.049us          3030  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.958ms         3.67%       1.958ms       1.275us          1536  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.749ms         3.28%       1.749ms       1.139us          1536  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.537ms         2.88%       1.537ms       0.985us          1560  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     938.961ms      1720.32%     938.961ms     938.961ms             1  
+                                           binned_torch        25.07%     235.565ms       100.00%     939.473ms     939.473ms       0.000us         0.00%      54.589ms      54.589ms             1  
+                                             aten::item         1.76%      16.540ms        26.46%     248.589ms      14.679us       0.000us         0.00%      17.855ms       1.054us         16935  
+                              aten::_local_scalar_dense         5.69%      53.475ms        24.70%     232.048ms      13.702us      17.853ms        32.71%      17.855ms       1.054us         16935  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      17.853ms        32.71%      17.853ms       1.054us         16935  
+                                              aten::bmm         0.02%     182.580us         0.02%     223.522us      37.254us       7.981ms        14.62%       7.981ms       1.330ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.981ms        14.62%       7.981ms       1.330ms             6  
+                                     aten::floor_divide         5.18%      48.644ms        12.51%     117.515ms      19.127us       7.813ms        14.31%       7.816ms       1.272us          6144  
+                                            aten::copy_         3.69%      34.686ms         8.73%      82.032ms      13.325us       6.629ms        12.15%       6.630ms       1.077us          6156  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.626ms        12.14%       6.626ms       1.077us          6153  
+                                              aten::add         3.97%      37.266ms         6.91%      64.908ms      14.132us       5.261ms         9.64%       5.261ms       1.145us          4593  
+                                              aten::mul         2.87%      26.992ms         5.23%      49.129ms      15.946us       4.699ms         8.61%       4.699ms       1.525us          3081  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.475ms         8.20%       4.475ms       1.457us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.158ms         7.62%       4.158ms       1.353us          3072  
+                                        aten::remainder         2.85%      26.773ms         4.50%      42.318ms      13.775us       3.852ms         7.06%       3.852ms       1.254us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.655ms         6.70%       3.655ms       1.190us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.271ms         5.99%       3.271ms       1.080us          3030  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.030ms         3.72%       2.030ms       1.322us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.822ms         3.34%       1.822ms       1.186us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.585ms         2.90%       1.585ms       1.016us          1560  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 940.182ms
-Self CUDA time total: 53.374ms
+Self CPU time total: 939.480ms
+Self CUDA time total: 54.581ms
 
 
 
@@ -4377,29 +4159,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.751s      1703.41%        1.751s        1.751s             1  
-                                           binned_torch        24.63%     431.727ms       100.00%        1.753s        1.753s       0.000us         0.00%     102.829ms     102.829ms             1  
-                                             aten::item         1.69%      29.621ms        25.96%     455.095ms      14.915us       0.000us         0.00%      31.387ms       1.029us         30513  
-                              aten::_local_scalar_dense         5.96%     104.552ms        24.27%     425.474ms      13.944us      31.383ms        30.52%      31.387ms       1.029us         30513  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      31.383ms        30.52%      31.383ms       1.029us         30513  
-                                              aten::bmm         0.01%     224.614us         0.02%     267.595us      44.599us      15.143ms        14.73%      15.143ms       2.524ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.143ms        14.73%      15.143ms       2.524ms             6  
-                                     aten::floor_divide         5.56%      97.549ms        13.34%     233.779ms      19.025us      15.089ms        14.68%      15.090ms       1.228us         12288  
-                                            aten::copy_         4.01%      70.283ms         9.47%     166.011ms      13.497us      13.317ms        12.95%      13.317ms       1.083us         12300  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.313ms        12.95%      13.313ms       1.083us         12294  
-                                              aten::mul         3.14%      55.060ms         5.66%      99.236ms      16.128us      11.295ms        10.99%      11.297ms       1.836us          6153  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.940ms         9.67%       9.940ms       1.618us          6144  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.059ms         7.84%       8.059ms       1.312us          6144  
-                                              aten::add         2.85%      49.952ms         4.90%      85.866ms      14.522us       7.505ms         7.30%       7.506ms       1.269us          5913  
-                                        aten::remainder         3.02%      53.015ms         4.74%      83.117ms      13.528us       7.414ms         7.21%       7.416ms       1.207us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.031ms         6.84%       7.031ms       1.144us          6144  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.224ms         6.05%       6.224ms       1.053us          5910  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.914ms         3.81%       3.914ms       1.274us          3072  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.500ms         3.40%       3.500ms       1.139us          3072  
-                                            aten::clamp         0.00%      71.603us         0.01%     117.833us      19.639us       1.180ms         1.15%       1.180ms     196.722us             6  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.710s      1645.94%        1.710s        1.710s             1  
+                                           binned_torch        23.47%     401.594ms       100.00%        1.711s        1.711s       0.000us         0.00%     103.932ms     103.932ms             1  
+                                             aten::item         1.77%      30.361ms        27.00%     461.971ms      15.140us       0.000us         0.00%      31.541ms       1.034us         30513  
+                              aten::_local_scalar_dense         5.97%     102.153ms        25.22%     431.610ms      14.145us      31.538ms        30.35%      31.541ms       1.034us         30513  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      31.538ms        30.35%      31.538ms       1.034us         30513  
+                                     aten::floor_divide         5.77%      98.697ms        13.68%     234.018ms      19.044us      15.598ms        15.01%      15.600ms       1.270us         12288  
+                                              aten::bmm         0.01%     219.084us         0.02%     260.723us      43.454us      15.235ms        14.66%      15.235ms       2.539ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.235ms        14.66%      15.235ms       2.539ms             6  
+                                            aten::copy_         3.97%      67.926ms         9.38%     160.451ms      13.045us      13.315ms        12.81%      13.316ms       1.083us         12300  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.311ms        12.81%      13.311ms       1.083us         12294  
+                                              aten::mul         3.19%      54.637ms         5.82%      99.678ms      16.200us      11.250ms        10.83%      11.252ms       1.829us          6153  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.903ms         9.53%       9.903ms       1.612us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.304ms         7.99%       8.304ms       1.352us          6144  
+                                        aten::remainder         3.07%      52.461ms         4.79%      82.008ms      13.348us       7.670ms         7.38%       7.671ms       1.249us          6144  
+                                              aten::add         2.76%      47.163ms         4.86%      83.106ms      14.055us       7.632ms         7.34%       7.633ms       1.291us          5913  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.294ms         7.02%       7.294ms       1.187us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.354ms         6.11%       6.354ms       1.075us          5910  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.041ms         3.89%       4.041ms       1.316us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.629ms         3.49%       3.629ms       1.181us          3072  
+                                            aten::clamp         0.00%      71.350us         0.01%     113.931us      18.988us       1.190ms         1.15%       1.190ms     198.366us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.753s
-Self CUDA time total: 102.819ms
+Self CPU time total: 1.711s
+Self CUDA time total: 103.922ms
 
 
 
@@ -4409,29 +4191,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.834s      1680.90%        1.834s        1.834s             1  
-                                           binned_torch        24.76%     454.393ms       100.00%        1.835s        1.835s       0.000us         0.00%     109.119ms     109.119ms             1  
-                                             aten::item         1.65%      30.229ms        26.42%     484.819ms      14.374us       0.000us         0.00%      34.734ms       1.030us         33729  
-                              aten::_local_scalar_dense         6.08%     111.551ms        24.77%     454.590ms      13.478us      34.731ms        31.83%      34.734ms       1.030us         33729  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      34.731ms        31.83%      34.731ms       1.030us         33729  
-                                              aten::bmm         0.01%     219.836us         0.01%     260.868us      43.478us      15.243ms        13.97%      15.243ms       2.540ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.243ms        13.97%      15.243ms       2.540ms             6  
-                                     aten::floor_divide         5.37%      98.619ms        12.62%     231.581ms      18.846us      15.065ms        13.81%      15.065ms       1.226us         12288  
-                                            aten::copy_         3.65%      66.986ms         8.64%     158.623ms      12.896us      13.313ms        12.20%      13.316ms       1.083us         12300  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.309ms        12.20%      13.309ms       1.082us         12297  
-                                              aten::mul         2.96%      54.365ms         5.27%      96.616ms      15.702us      10.967ms        10.05%      10.969ms       1.783us          6153  
-                                              aten::add         4.05%      74.247ms         6.97%     127.934ms      14.060us      10.631ms         9.74%      10.631ms       1.168us          9099  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.613ms         8.81%       9.613ms       1.565us          6144  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.047ms         7.37%       8.047ms       1.310us          6144  
-                                        aten::remainder         2.81%      51.641ms         4.37%      80.193ms      13.052us       7.438ms         6.82%       7.438ms       1.211us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.018ms         6.43%       7.018ms       1.142us          6144  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.225ms         5.71%       6.225ms       1.053us          5910  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.928ms         3.60%       3.928ms       1.279us          3072  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.510ms         3.22%       3.510ms       1.143us          3072  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.154ms         2.89%       3.154ms       0.990us          3186  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.831s      1659.19%        1.831s        1.831s             1  
+                                           binned_torch        23.77%     435.469ms       100.00%        1.832s        1.832s       0.000us         0.00%     110.361ms     110.361ms             1  
+                                             aten::item         1.74%      31.875ms        27.52%     504.183ms      14.948us       0.000us         0.00%      34.964ms       1.037us         33729  
+                              aten::_local_scalar_dense         6.20%     113.521ms        25.78%     472.309ms      14.003us      34.961ms        31.68%      34.964ms       1.037us         33729  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      34.961ms        31.68%      34.961ms       1.037us         33729  
+                                     aten::floor_divide         5.21%      95.369ms        12.55%     229.877ms      18.707us      15.595ms        14.13%      15.597ms       1.269us         12288  
+                                              aten::bmm         0.01%     225.035us         0.01%     267.825us      44.638us      15.231ms        13.80%      15.231ms       2.539ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.231ms        13.80%      15.231ms       2.539ms             6  
+                                            aten::copy_         3.69%      67.648ms         8.80%     161.241ms      13.109us      13.343ms        12.09%      13.347ms       1.085us         12300  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.340ms        12.09%      13.340ms       1.085us         12297  
+                                              aten::mul         2.99%      54.761ms         5.39%      98.799ms      16.057us      10.934ms         9.91%      10.936ms       1.777us          6153  
+                                              aten::add         3.91%      71.612ms         6.90%     126.397ms      13.891us      10.863ms         9.84%      10.863ms       1.194us          9099  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.586ms         8.69%       9.586ms       1.560us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.308ms         7.53%       8.308ms       1.352us          6144  
+                                        aten::remainder         2.81%      51.395ms         4.41%      80.796ms      13.150us       7.688ms         6.97%       7.688ms       1.251us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.287ms         6.60%       7.287ms       1.186us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.364ms         5.77%       6.364ms       1.077us          5910  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.054ms         3.67%       4.054ms       1.320us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.634ms         3.29%       3.634ms       1.183us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.232ms         2.93%       3.232ms       1.014us          3186  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.835s
-Self CUDA time total: 109.111ms
+Self CPU time total: 1.832s
+Self CUDA time total: 110.351ms
 
 
 
@@ -4441,29 +4223,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.518s      1672.53%        3.518s        3.518s             1  
-                                           binned_torch        24.37%     858.118ms       100.00%        3.521s        3.521s       0.000us         0.00%     210.357ms     210.357ms             1  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      63.177ms        30.04%      63.177ms       1.026us         61586  
-                                             aten::item         1.69%      59.432ms        26.02%     916.275ms      14.878us       0.000us         0.00%      63.177ms       1.026us         61587  
-                              aten::_local_scalar_dense         5.96%     209.806ms        24.34%     856.843ms      13.913us      63.176ms        30.03%      63.177ms       1.026us         61587  
-                                     aten::floor_divide         5.42%     190.698ms        13.50%     475.217ms      19.337us      30.482ms        14.49%      30.486ms       1.240us         24576  
-                                              aten::bmm         0.01%     235.397us         0.01%     281.998us      47.000us      29.291ms        13.93%      29.291ms       4.882ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.291ms        13.93%      29.291ms       4.882ms             6  
-                                            aten::copy_         3.77%     132.744ms         9.15%     322.282ms      13.107us      26.808ms        12.75%      26.810ms       1.090us         24588  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.805ms        12.74%      26.805ms       1.090us         24582  
-                                              aten::mul         3.15%     110.895ms         5.78%     203.457ms      16.545us      25.566ms        12.15%      25.568ms       2.079us         12297  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.101ms        10.51%      22.101ms       1.799us         12288  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.470ms         7.83%      16.470ms       1.340us         12288  
-                                              aten::add         2.99%     105.439ms         5.15%     181.211ms      14.601us      16.115ms         7.66%      16.116ms       1.298us         12411  
-                                        aten::remainder         2.99%     105.111ms         4.72%     166.195ms      13.525us      14.836ms         7.05%      14.838ms       1.208us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.014ms         6.66%      14.014ms       1.140us         12288  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.996ms         6.18%      12.996ms       1.047us         12408  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       7.830ms         3.72%       7.830ms       1.274us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.006ms         3.33%       7.006ms       1.140us          6144  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.626ms         1.25%       2.626ms     437.595us             6  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.493s      1641.52%        3.493s        3.493s             1  
+                                           binned_torch        23.72%     828.141ms       100.00%        3.492s        3.492s       0.000us         0.00%     212.777ms     212.777ms             1  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      63.619ms        29.90%      63.619ms       1.033us         61586  
+                                             aten::item         1.76%      61.470ms        26.76%     934.319ms      15.171us       0.000us         0.00%      63.619ms       1.033us         61587  
+                              aten::_local_scalar_dense         5.95%     207.894ms        25.00%     872.849ms      14.173us      63.616ms        29.90%      63.619ms       1.033us         61587  
+                                     aten::floor_divide         5.53%     193.077ms        13.34%     465.879ms      18.957us      31.606ms        14.86%      31.612ms       1.286us         24576  
+                                              aten::bmm         0.01%     236.694us         0.01%     284.594us      47.432us      29.067ms        13.66%      29.067ms       4.844ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.067ms        13.66%      29.067ms       4.844ms             6  
+                                            aten::copy_         3.89%     135.756ms         9.33%     325.881ms      13.254us      26.713ms        12.56%      26.714ms       1.086us         24588  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.711ms        12.55%      26.711ms       1.087us         24582  
+                                              aten::mul         3.15%     110.066ms         5.73%     199.944ms      16.260us      25.593ms        12.03%      25.595ms       2.081us         12297  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.131ms        10.40%      22.131ms       1.801us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      17.009ms         7.99%      17.009ms       1.384us         12288  
+                                              aten::add         2.82%      98.495ms         4.98%     173.932ms      14.014us      16.658ms         7.83%      16.659ms       1.342us         12411  
+                                        aten::remainder         3.04%     106.037ms         4.77%     166.563ms      13.555us      15.433ms         7.25%      15.435ms       1.256us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.597ms         6.86%      14.597ms       1.188us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.527ms         6.36%      13.527ms       1.090us         12408  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.132ms         3.82%       8.132ms       1.324us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.300ms         3.43%       7.300ms       1.188us          6144  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.623ms         1.23%       2.623ms     437.201us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.521s
-Self CUDA time total: 210.342ms
+Self CPU time total: 3.492s
+Self CUDA time total: 212.763ms
 
 
 
@@ -4473,29 +4255,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.742s      1679.57%        3.742s        3.742s             1  
-                                           binned_torch        24.42%     914.204ms       100.00%        3.744s        3.744s       0.000us         0.00%     222.834ms     222.834ms             1  
-                                             aten::item         1.73%      64.729ms        26.53%     993.125ms      14.638us       0.000us         0.00%      69.848ms       1.030us         67845  
-                              aten::_local_scalar_dense         6.14%     229.850ms        24.80%     928.396ms      13.684us      69.844ms        31.35%      69.848ms       1.030us         67845  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      69.844ms        31.35%      69.844ms       1.030us         67841  
-                                     aten::floor_divide         5.29%     197.931ms        12.52%     468.921ms      19.080us      30.509ms        13.69%      30.515ms       1.242us         24576  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.140ms        13.08%      29.140ms       4.857ms             6  
-                                              aten::bmm         0.01%     232.675us         0.01%     273.538us      45.590us      29.140ms        13.08%      29.140ms       4.857ms             6  
-                                            aten::copy_         3.66%     136.881ms         8.73%     326.908ms      13.295us      26.646ms        11.96%      26.647ms       1.084us         24588  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.643ms        11.96%      26.643ms       1.084us         24581  
-                                              aten::mul         2.96%     110.832ms         5.24%     196.253ms      15.959us      25.520ms        11.45%      25.522ms       2.075us         12297  
-                                              aten::add         4.16%     155.619ms         7.13%     266.948ms      14.322us      22.169ms         9.95%      22.169ms       1.189us         18639  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.076ms         9.91%      22.076ms       1.797us         12288  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.462ms         7.39%      16.462ms       1.340us         12287  
-                                        aten::remainder         2.77%     103.887ms         4.33%     162.240ms      13.203us      14.877ms         6.68%      14.879ms       1.211us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.047ms         6.30%      14.047ms       1.143us         12287  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.957ms         5.82%      12.957ms       1.044us         12407  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       7.856ms         3.53%       7.856ms       1.279us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.021ms         3.15%       7.021ms       1.143us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.109ms         2.74%       6.109ms       0.981us          6228  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.669s      1629.04%        3.669s        3.669s             1  
+                                           binned_torch        23.71%     870.025ms       100.00%        3.670s        3.670s       0.000us         0.00%     225.217ms     225.217ms             1  
+                                             aten::item         1.74%      63.801ms        26.98%     990.130ms      14.594us       0.000us         0.00%      69.736ms       1.028us         67845  
+                              aten::_local_scalar_dense         5.93%     217.737ms        25.24%     926.329ms      13.654us      69.731ms        30.96%      69.736ms       1.028us         67845  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      69.731ms        30.96%      69.731ms       1.028us         67841  
+                                     aten::floor_divide         5.15%     189.112ms        12.36%     453.770ms      18.464us      31.523ms        14.00%      31.529ms       1.283us         24576  
+                                              aten::bmm         0.01%     229.594us         0.01%     272.075us      45.346us      28.926ms        12.84%      28.926ms       4.821ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      28.926ms        12.84%      28.926ms       4.821ms             6  
+                                            aten::copy_         3.90%     143.149ms         8.93%     327.628ms      13.325us      26.721ms        11.87%      26.722ms       1.087us         24588  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.719ms        11.86%      26.719ms       1.087us         24581  
+                                              aten::mul         3.13%     114.822ms         5.47%     200.852ms      16.333us      25.594ms        11.37%      25.596ms       2.081us         12297  
+                                              aten::add         3.87%     141.881ms         6.78%     248.742ms      13.345us      23.243ms        10.32%      23.243ms       1.247us         18639  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.132ms         9.83%      22.132ms       1.801us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.988ms         7.54%      16.988ms       1.383us         12287  
+                                        aten::remainder         2.85%     104.729ms         4.42%     162.304ms      13.208us      15.354ms         6.82%      15.355ms       1.250us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.535ms         6.45%      14.535ms       1.183us         12287  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.676ms         6.07%      13.676ms       1.102us         12407  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.096ms         3.60%       8.096ms       1.318us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.258ms         3.22%       7.258ms       1.181us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.475ms         2.88%       6.475ms       1.040us          6228  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.744s
-Self CUDA time total: 222.814ms
+Self CPU time total: 3.670s
+Self CUDA time total: 225.199ms
 
 
 
@@ -4505,29 +4287,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        6.967s      1665.27%        6.967s        6.967s             1  
-                                           binned_torch        24.68%        1.721s       100.00%        6.973s        6.973s       0.000us         0.00%     418.392ms     418.392ms             1  
-                                             aten::item         1.64%     114.231ms        25.94%        1.809s      14.732us       0.000us         0.00%     125.163ms       1.020us        122763  
-                              aten::_local_scalar_dense         5.97%     416.624ms        24.30%        1.694s      13.802us     125.151ms        29.91%     125.163ms       1.020us        122763  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     125.151ms        29.91%     125.151ms       1.019us        122762  
-                                     aten::floor_divide         5.62%     391.846ms        13.33%     929.253ms      18.906us      61.051ms        14.59%      61.053ms       1.242us         49152  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.281ms        13.69%      57.281ms       9.547ms             6  
-                                              aten::bmm         0.00%     234.996us         0.00%     276.787us      46.131us      57.281ms        13.69%      57.281ms       9.547ms             6  
-                                            aten::copy_         3.92%     273.517ms         9.35%     652.240ms      13.268us      53.435ms        12.77%      53.437ms       1.087us         49158  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.433ms        12.77%      53.433ms       1.087us         49154  
-                                              aten::mul         3.15%     219.950ms         5.62%     391.612ms      15.929us      51.411ms        12.29%      51.419ms       2.091us         24585  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.451ms        10.62%      44.451ms       1.809us         24576  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      32.993ms         7.89%      32.993ms       1.343us         24576  
-                                              aten::add         2.87%     200.428ms         4.94%     344.166ms      14.085us      31.887ms         7.62%      31.889ms       1.305us         24435  
-                                        aten::remainder         3.00%     208.953ms         4.67%     325.902ms      13.261us      29.680ms         7.09%      29.684ms       1.208us         24576  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      28.059ms         6.71%      28.059ms       1.142us         24576  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.247ms         6.03%      25.247ms       1.033us         24431  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.667ms         3.74%      15.667ms       1.275us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.014ms         3.35%      14.014ms       1.140us         12288  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       5.233ms         1.25%       5.233ms     872.184us             6  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        6.859s      1611.59%        6.859s        6.859s             1  
+                                           binned_torch        24.10%        1.655s       100.00%        6.866s        6.866s       0.000us         0.00%     425.661ms     425.661ms             1  
+                                             aten::item         1.68%     115.068ms        26.29%        1.805s      14.704us       0.000us         0.00%     127.116ms       1.035us        122763  
+                              aten::_local_scalar_dense         5.74%     393.879ms        24.61%        1.690s      13.764us     127.109ms        29.86%     127.116ms       1.035us        122763  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     127.110ms        29.86%     127.110ms       1.035us        122762  
+                                     aten::floor_divide         5.46%     374.656ms        13.09%     898.826ms      18.287us      63.404ms        14.90%      63.408ms       1.290us         49152  
+                                              aten::bmm         0.00%     234.973us         0.00%     276.793us      46.132us      56.971ms        13.38%      56.971ms       9.495ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      56.971ms        13.38%      56.971ms       9.495ms             6  
+                                            aten::copy_         4.17%     286.167ms         9.49%     651.750ms      13.258us      53.615ms        12.60%      53.616ms       1.091us         49158  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.612ms        12.60%      53.612ms       1.091us         49154  
+                                              aten::mul         3.34%     229.543ms         5.86%     402.465ms      16.370us      51.556ms        12.11%      51.561ms       2.097us         24585  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.609ms        10.48%      44.609ms       1.815us         24576  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      34.184ms         8.03%      34.184ms       1.391us         24576  
+                                              aten::add         2.69%     184.813ms         4.71%     323.308ms      13.231us      33.584ms         7.89%      33.588ms       1.375us         24435  
+                                        aten::remainder         3.06%     210.055ms         4.75%     326.044ms      13.267us      30.927ms         7.27%      30.931ms       1.259us         24576  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.221ms         6.87%      29.221ms       1.189us         24576  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      26.946ms         6.33%      26.946ms       1.103us         24431  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.291ms         3.83%      16.291ms       1.326us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.637ms         3.44%      14.637ms       1.191us         12288  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       5.222ms         1.23%       5.222ms     870.407us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.973s
-Self CUDA time total: 418.361ms
+Self CPU time total: 6.866s
+Self CUDA time total: 425.634ms
 
 
 
@@ -4537,40 +4319,40 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        7.368s      1660.72%        7.368s        7.368s             1  
-                                           binned_torch        24.39%        1.797s       100.00%        7.370s        7.370s       0.000us         0.00%     443.698ms     443.698ms             1  
-                                             aten::item         1.69%     124.742ms        26.51%        1.954s      14.504us       0.000us         0.00%     137.717ms       1.022us        134715  
-                              aten::_local_scalar_dense         6.11%     450.407ms        24.82%        1.829s      13.577us     137.708ms        31.04%     137.717ms       1.022us        134715  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     137.710ms        31.04%     137.710ms       1.022us        134711  
-                                     aten::floor_divide         5.42%     399.563ms        12.65%     932.414ms      18.970us      61.071ms        13.77%      61.077ms       1.243us         49152  
-                                              aten::bmm         0.00%     230.664us         0.00%     272.466us      45.411us      57.304ms        12.92%      57.304ms       9.551ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.304ms        12.92%      57.304ms       9.551ms             6  
-                                            aten::copy_         3.65%     269.132ms         8.67%     639.259ms      13.004us      54.065ms        12.19%      54.067ms       1.100us         49158  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.062ms        12.19%      54.062ms       1.100us         49153  
-                                              aten::mul         2.96%     217.959ms         5.26%     387.551ms      15.764us      51.653ms        11.64%      51.660ms       2.101us         24585  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.653ms        10.06%      44.653ms       1.817us         24576  
-                                              aten::add         4.03%     296.962ms         6.96%     512.647ms      14.100us      43.690ms         9.85%      43.694ms       1.202us         36357  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      32.954ms         7.43%      32.954ms       1.341us         24575  
-                                        aten::remainder         2.83%     208.527ms         4.40%     323.906ms      13.180us      29.662ms         6.69%      29.664ms       1.207us         24576  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      28.119ms         6.34%      28.119ms       1.144us         24576  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.409ms         5.73%      25.409ms       1.040us         24431  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.666ms         3.53%      15.666ms       1.275us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      13.995ms         3.15%      13.995ms       1.139us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.644ms         2.62%      11.644ms       0.977us         11922  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        7.331s      1630.84%        7.331s        7.331s             1  
+                                           binned_torch        23.92%        1.754s       100.00%        7.333s        7.333s       0.000us         0.00%     449.578ms     449.578ms             1  
+                                             aten::item         1.73%     127.153ms        27.44%        2.013s      14.940us       0.000us         0.00%     139.264ms       1.034us        134715  
+                              aten::_local_scalar_dense         6.23%     456.926ms        25.71%        1.885s      13.996us     139.253ms        30.98%     139.264ms       1.034us        134715  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     139.255ms        30.98%     139.255ms       1.034us        134707  
+                                     aten::floor_divide         5.02%     368.091ms        12.28%     900.843ms      18.328us      63.383ms        14.10%      63.388ms       1.290us         49152  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      56.831ms        12.64%      56.831ms       9.472ms             6  
+                                              aten::bmm         0.00%     231.002us         0.00%     273.424us      45.571us      56.831ms        12.64%      56.831ms       9.472ms             6  
+                                            aten::copy_         3.67%     268.957ms         8.71%     638.523ms      12.989us      53.771ms        11.96%      53.773ms       1.094us         49158  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.768ms        11.96%      53.768ms       1.094us         49149  
+                                              aten::mul         2.96%     217.228ms         5.34%     391.576ms      15.927us      51.518ms        11.46%      51.524ms       2.096us         24585  
+                                              aten::add         3.83%     280.607ms         6.79%     497.692ms      13.689us      45.514ms        10.12%      45.518ms       1.252us         36357  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.542ms         9.91%      44.542ms       1.812us         24576  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      34.127ms         7.59%      34.127ms       1.389us         24573  
+                                        aten::remainder         2.85%     209.203ms         4.50%     330.314ms      13.441us      30.793ms         6.85%      30.795ms       1.253us         24576  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.257ms         6.51%      29.257ms       1.191us         24573  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      26.610ms         5.92%      26.610ms       1.089us         24431  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.233ms         3.61%      16.233ms       1.321us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.559ms         3.24%      14.559ms       1.185us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.261ms         2.73%      12.261ms       1.028us         11922  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.370s
-Self CUDA time total: 443.660ms
+Self CPU time total: 7.333s
+Self CUDA time total: 449.542ms
 
 
 impl                     wl                  p50(ms)  ok
-binned_torch             cuda_B1_S1024_E2     372.79  True
-binned_torch             cuda_B1_S1024_E4     382.68  True
-binned_torch             cuda_B1_S512_E2      150.05  True
-binned_torch             cuda_B1_S512_E4      200.26  True
-binned_torch             cuda_B4_S1024_E2    1486.48  True
-binned_torch             cuda_B4_S1024_E4    1524.50  True
-binned_torch             cuda_B4_S512_E2      742.02  True
-binned_torch             cuda_B4_S512_E4      801.90  True
+binned_torch             cuda_B1_S1024_E2     367.62  True
+binned_torch             cuda_B1_S1024_E4     394.19  True
+binned_torch             cuda_B1_S512_E2      154.67  True
+binned_torch             cuda_B1_S512_E4      201.50  True
+binned_torch             cuda_B4_S1024_E2    1483.54  True
+binned_torch             cuda_B4_S1024_E4    1601.90  True
+binned_torch             cuda_B4_S512_E2      736.26  True
+binned_torch             cuda_B4_S512_E4      798.88  True
 

Artifacts: