diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -4106,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.22s | Raw @@ -4122,16 +3904,16 @@ Cell: nv | 0.21s
-
Fri Oct 31 20:00:25 2025       
+
Mon Nov 10 21:57:49 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
-|-----------------------------------------+------------------------+----------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0             79W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   27C    P0             77W /  350W |       0MiB /  46068MiB |     18%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,7 +3937,7 @@ Cell: nv | 0.21s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 3.68s
+Cell: benchmark | 3.89s
  | 
 
 Raw
@@ -4217,29 +3999,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     439.324us      2269.12%     439.324us     439.324us             1  
-                                            torch_eager        10.31%     220.478us        99.69%       2.131ms       2.131ms       0.000us         0.00%      21.729us      21.729us             1  
-                                               aten::to         0.50%      10.770us        79.87%       1.707ms     284.530us       0.000us         0.00%      14.369us       2.395us             6  
-                                         aten::_to_copy         1.71%      36.499us        79.36%       1.696ms     282.735us       0.000us         0.00%      14.369us       2.395us             6  
-                                            aten::copy_         2.77%      59.234us        75.21%       1.608ms     267.930us      12.001us        61.99%      14.369us       2.395us             6  
-                                           aten::conv1d         0.36%       7.590us         7.34%     156.883us      52.294us       0.000us         0.00%       7.360us       2.453us             3  
-                                      aten::convolution         0.66%      14.070us         6.98%     149.293us      49.764us       0.000us         0.00%       7.360us       2.453us             3  
-                                     aten::_convolution         1.51%      32.210us         6.33%     135.223us      45.074us       0.000us         0.00%       7.360us       2.453us             3  
-                                aten::_conv_depthwise2d         1.61%      34.371us         4.00%      85.463us      28.488us       7.360us        38.01%       7.360us       2.453us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        38.01%       7.360us       2.453us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.337us        32.73%       6.337us       2.112us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.25%       5.664us       1.888us             3  
-                                Activity Buffer Request        69.37%       1.483ms        69.37%       1.483ms       1.483ms       2.368us        12.23%       2.368us       2.368us             1  
-                                    aten::empty_strided         2.45%      52.331us         2.45%      52.331us       8.722us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.26%      91.032us         4.26%      91.032us      10.115us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.32%      28.311us         1.71%      36.491us       4.055us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.64%      13.700us         0.64%      13.700us       0.913us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.60%      12.790us         0.60%      12.790us       4.263us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.59%      12.710us         0.59%      12.710us       4.237us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       6.640us         0.38%       8.090us       2.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     411.136us      2127.15%     411.136us     411.136us             1  
+                                            torch_eager         8.60%     205.173us        99.40%       2.372ms       2.372ms       0.000us         0.00%      21.632us      21.632us             1  
+                                               aten::to         0.40%       9.649us        83.06%       1.982ms     330.358us       0.000us         0.00%      14.272us       2.379us             6  
+                                         aten::_to_copy         1.47%      35.141us        82.65%       1.973ms     328.750us       0.000us         0.00%      14.272us       2.379us             6  
+                                            aten::copy_         2.42%      57.830us        79.13%       1.889ms     314.753us      11.968us        61.92%      14.272us       2.379us             6  
+                                           aten::conv1d         0.32%       7.640us         6.22%     148.384us      49.461us       0.000us         0.00%       7.360us       2.453us             3  
+                                      aten::convolution         0.55%      13.222us         5.90%     140.744us      46.915us       0.000us         0.00%       7.360us       2.453us             3  
+                                     aten::_convolution         1.23%      29.427us         5.34%     127.522us      42.507us       0.000us         0.00%       7.360us       2.453us             3  
+                                aten::_conv_depthwise2d         1.41%      33.690us         3.44%      82.073us      27.358us       7.360us        38.08%       7.360us       2.453us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        38.08%       7.360us       2.453us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.62%       6.304us       2.101us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.30%       5.664us       1.888us             3  
+                                Activity Buffer Request        73.85%       1.762ms        73.85%       1.762ms       1.762ms       2.304us        11.92%       2.304us       2.304us             1  
+                                    aten::empty_strided         2.05%      48.841us         2.05%      48.841us       8.140us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.88%      92.484us         3.88%      92.484us      10.276us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.94%      22.551us         1.23%      29.352us       3.261us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%      10.991us         0.46%      10.991us       0.733us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      12.660us         0.53%      12.660us       4.220us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.49%      11.631us         0.49%      11.631us       3.877us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       6.340us         0.32%       7.570us       2.523us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.138ms
-Self CUDA time total: 19.361us
+Self CPU time total: 2.386ms
+Self CUDA time total: 19.328us
 
 
 
@@ -4249,29 +4031,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.789us      1742.49%     341.789us     341.789us             1  
-                                            torch_eager         7.86%     151.082us        99.71%       1.916ms       1.916ms       0.000us         0.00%      21.695us      21.695us             1  
-                                               aten::to         0.35%       6.661us        83.96%       1.614ms     268.966us       0.000us         0.00%      13.695us       2.282us             6  
-                                         aten::_to_copy         1.29%      24.781us        83.61%       1.607ms     267.856us       0.000us         0.00%      13.695us       2.282us             6  
-                                            aten::copy_         2.59%      49.784us        80.72%       1.552ms     258.589us      11.615us        59.21%      13.695us       2.282us             6  
-                                           aten::conv1d         0.32%       6.220us         6.35%     122.113us      40.704us       0.000us         0.00%       8.000us       2.667us             3  
-                                      aten::convolution         0.53%      10.120us         6.03%     115.893us      38.631us       0.000us         0.00%       8.000us       2.667us             3  
-                                     aten::_convolution         1.20%      23.080us         5.50%     105.773us      35.258us       0.000us         0.00%       8.000us       2.667us             3  
-                                aten::_conv_depthwise2d         1.19%      22.952us         3.39%      65.123us      21.708us       8.000us        40.79%       8.000us       2.667us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.000us        40.79%       8.000us       2.667us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.047us        30.83%       6.047us       2.016us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.39%       5.568us       1.856us             3  
-                                Activity Buffer Request        75.54%       1.452ms        75.54%       1.452ms       1.452ms       2.080us        10.60%       2.080us       2.080us             1  
-                                    aten::empty_strided         1.60%      30.820us         1.60%      30.820us       5.137us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.74%      71.953us         3.74%      71.953us       7.995us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.98%      18.881us         1.29%      24.750us       2.750us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.50%       9.609us         0.50%       9.609us       0.641us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.56%      10.750us         0.56%      10.750us       3.583us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.49%       9.339us         0.49%       9.339us       3.113us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.630us         0.42%       8.000us       2.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     320.094us      1629.14%     320.094us     320.094us             1  
+                                            torch_eager         6.61%     147.267us        99.75%       2.222ms       2.222ms       0.000us         0.00%      21.856us      21.856us             1  
+                                               aten::to         0.28%       6.328us        86.86%       1.935ms     322.525us       0.000us         0.00%      13.888us       2.315us             6  
+                                         aten::_to_copy         0.99%      22.058us        86.58%       1.929ms     321.470us       0.000us         0.00%      13.888us       2.315us             6  
+                                            aten::copy_         2.09%      46.581us        84.13%       1.874ms     312.384us      11.680us        59.45%      13.888us       2.315us             6  
+                                           aten::conv1d         0.26%       5.880us         5.20%     115.901us      38.634us       0.000us         0.00%       7.968us       2.656us             3  
+                                      aten::convolution         0.41%       9.201us         4.94%     110.021us      36.674us       0.000us         0.00%       7.968us       2.656us             3  
+                                     aten::_convolution         0.99%      22.029us         4.53%     100.820us      33.607us       0.000us         0.00%       7.968us       2.656us             3  
+                                aten::_conv_depthwise2d         0.98%      21.809us         2.84%      63.210us      21.070us       7.968us        40.55%       7.968us       2.656us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        40.55%       7.968us       2.656us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.112us        31.11%       6.112us       2.037us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.34%       5.568us       1.856us             3  
+                                Activity Buffer Request        79.89%       1.780ms        79.89%       1.780ms       1.780ms       2.208us        11.24%       2.208us       2.208us             1  
+                                    aten::empty_strided         1.46%      32.461us         1.46%      32.461us       5.410us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.22%      71.802us         3.22%      71.802us       7.978us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.71%      15.809us         0.93%      20.750us       2.306us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.38%       8.492us         0.38%       8.492us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.41%       9.081us         0.41%       9.081us       3.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.38%       8.530us         0.38%       8.530us       2.843us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.26%       5.730us         0.32%       7.140us       2.380us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.922ms
-Self CUDA time total: 19.615us
+Self CPU time total: 2.228ms
+Self CUDA time total: 19.648us
 
 
 
@@ -4281,29 +4063,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     343.328us      1837.45%     343.328us     343.328us             1  
-                                            torch_eager         7.88%     151.015us        99.69%       1.911ms       1.911ms       0.000us         0.00%      20.605us      20.605us             1  
-                                               aten::to         0.33%       6.409us        84.02%       1.611ms     268.468us       0.000us         0.00%      13.662us       2.277us             6  
-                                         aten::_to_copy         1.32%      25.354us        83.68%       1.604ms     267.400us       0.000us         0.00%      13.662us       2.277us             6  
-                                            aten::copy_         2.65%      50.770us        80.80%       1.549ms     258.170us      11.742us        62.84%      13.662us       2.277us             6  
-                                           aten::conv1d         0.33%       6.290us         6.34%     121.483us      40.494us       0.000us         0.00%       6.943us       2.314us             3  
-                                      aten::convolution         0.54%      10.430us         6.01%     115.193us      38.398us       0.000us         0.00%       6.943us       2.314us             3  
-                                     aten::_convolution         1.17%      22.439us         5.46%     104.763us      34.921us       0.000us         0.00%       6.943us       2.314us             3  
-                                aten::_conv_depthwise2d         1.17%      22.412us         3.43%      65.843us      21.948us       6.943us        37.16%       6.943us       2.314us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.943us        37.16%       6.943us       2.314us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.982us        32.01%       5.982us       1.994us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.760us        30.83%       5.760us       1.920us             3  
-                                Activity Buffer Request        75.50%       1.448ms        75.50%       1.448ms       1.448ms       1.920us        10.28%       1.920us       1.920us             1  
-                                    aten::empty_strided         1.57%      30.029us         1.57%      30.029us       5.005us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.90%      74.680us         3.90%      74.680us       8.298us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.93%      17.782us         1.21%      23.252us       2.584us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.48%       9.281us         0.48%       9.281us       0.619us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.57%      10.910us         0.57%      10.910us       3.637us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%       8.531us         0.44%       8.531us       2.844us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       6.170us         0.39%       7.570us       2.523us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.750us      1724.09%     322.750us     322.750us             1  
+                                            torch_eager         6.97%     154.353us        99.74%       2.208ms       2.208ms       0.000us         0.00%      20.736us      20.736us             1  
+                                               aten::to         0.30%       6.580us        86.44%       1.913ms     318.849us       0.000us         0.00%      13.791us       2.299us             6  
+                                         aten::_to_copy         1.09%      24.161us        86.14%       1.907ms     317.752us       0.000us         0.00%      13.791us       2.299us             6  
+                                            aten::copy_         2.12%      46.909us        83.64%       1.851ms     308.533us      11.775us        62.90%      13.791us       2.299us             6  
+                                           aten::conv1d         0.30%       6.591us         5.18%     114.662us      38.221us       0.000us         0.00%       6.945us       2.315us             3  
+                                      aten::convolution         0.40%       8.811us         4.88%     108.071us      36.024us       0.000us         0.00%       6.945us       2.315us             3  
+                                     aten::_convolution         0.96%      21.188us         4.48%      99.260us      33.087us       0.000us         0.00%       6.945us       2.315us             3  
+                                aten::_conv_depthwise2d         0.97%      21.520us         2.82%      62.461us      20.820us       6.945us        37.10%       6.945us       2.315us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.945us        37.10%       6.945us       2.315us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.047us        32.30%       6.047us       2.016us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        30.60%       5.728us       1.909us             3  
+                                Activity Buffer Request        79.41%       1.758ms        79.41%       1.758ms       1.758ms       2.016us        10.77%       2.016us       2.016us             1  
+                                    aten::empty_strided         1.41%      31.151us         1.41%      31.151us       5.192us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.17%      70.153us         3.17%      70.153us       7.795us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.77%      17.060us         1.01%      22.310us       2.479us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.39%       8.641us         0.39%       8.641us       0.576us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.42%       9.380us         0.42%       9.380us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       8.090us         0.37%       8.090us       2.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.25%       5.450us         0.31%       6.801us       2.267us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.917ms
-Self CUDA time total: 18.685us
+Self CPU time total: 2.213ms
+Self CUDA time total: 18.720us
 
 
 
@@ -4313,29 +4095,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.280us      1734.88%     340.280us     340.280us             1  
-                                            torch_eager         6.89%     141.563us        99.72%       2.049ms       2.049ms       0.000us         0.00%      21.726us      21.726us             1  
-                                               aten::to         0.30%       6.132us        85.38%       1.755ms     292.424us       0.000us         0.00%      13.982us       2.330us             6  
-                                         aten::_to_copy         1.19%      24.439us        85.08%       1.748ms     291.402us       0.000us         0.00%      13.982us       2.330us             6  
-                                            aten::copy_         2.50%      51.302us        82.39%       1.693ms     282.182us      11.870us        60.52%      13.982us       2.330us             6  
-                                           aten::conv1d         0.29%       5.930us         5.97%     122.723us      40.908us       0.000us         0.00%       7.744us       2.581us             3  
-                                      aten::convolution         0.50%      10.300us         5.68%     116.793us      38.931us       0.000us         0.00%       7.744us       2.581us             3  
-                                     aten::_convolution         1.17%      23.960us         5.18%     106.493us      35.498us       0.000us         0.00%       7.744us       2.581us             3  
-                                aten::_conv_depthwise2d         1.08%      22.141us         3.19%      65.452us      21.817us       7.744us        39.48%       7.744us       2.581us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        39.48%       7.744us       2.581us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.143us        31.32%       6.143us       2.048us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.727us        29.20%       5.727us       1.909us             3  
-                                Activity Buffer Request        70.00%       1.438ms        70.00%       1.438ms       1.438ms       2.112us        10.77%       2.112us       2.112us             1  
-                                    aten::empty_strided         1.50%      30.881us         1.50%      30.881us       5.147us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.01%     226.194us        11.01%     226.194us      25.133us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      18.302us         1.19%      24.432us       2.715us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.49%       9.981us         0.49%       9.981us       0.665us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      11.260us         0.55%      11.260us       3.753us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.45%       9.171us         0.45%       9.171us       3.057us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       6.620us         0.39%       8.030us       2.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.254us      1673.31%     328.254us     328.254us             1  
+                                            torch_eager         6.02%     146.742us        99.79%       2.431ms       2.431ms       0.000us         0.00%      21.729us      21.729us             1  
+                                               aten::to         0.25%       6.201us        87.89%       2.141ms     356.794us       0.000us         0.00%      14.048us       2.341us             6  
+                                         aten::_to_copy         0.95%      23.051us        87.64%       2.135ms     355.761us       0.000us         0.00%      14.048us       2.341us             6  
+                                            aten::copy_         1.93%      46.899us        85.39%       2.080ms     346.662us      11.936us        60.85%      14.048us       2.341us             6  
+                                           aten::conv1d         0.28%       6.941us         4.83%     117.552us      39.184us       0.000us         0.00%       7.681us       2.560us             3  
+                                      aten::convolution         0.38%       9.320us         4.54%     110.611us      36.870us       0.000us         0.00%       7.681us       2.560us             3  
+                                     aten::_convolution         0.86%      20.861us         4.16%     101.291us      33.764us       0.000us         0.00%       7.681us       2.560us             3  
+                                aten::_conv_depthwise2d         0.93%      22.752us         2.67%      64.991us      21.664us       7.681us        39.15%       7.681us       2.560us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.681us        39.15%       7.681us       2.560us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        31.65%       6.208us       2.069us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.20%       5.728us       1.909us             3  
+                                Activity Buffer Request        75.50%       1.839ms        75.50%       1.839ms       1.839ms       2.112us        10.77%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.29%      31.540us         1.29%      31.540us       5.257us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.87%     216.103us         8.87%     216.103us      24.011us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.989us         0.90%      21.970us       2.441us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.35%       8.601us         0.35%       8.601us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%      10.359us         0.43%      10.359us       3.453us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       9.840us         0.40%       9.840us       3.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.22%       5.410us         0.28%       6.920us       2.307us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.055ms
-Self CUDA time total: 19.614us
+Self CPU time total: 2.436ms
+Self CUDA time total: 19.617us
 
 
 
@@ -4345,29 +4127,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     379.964us      1548.03%     379.964us     379.964us             1  
-                                            torch_eager         7.69%     160.944us        99.76%       2.089ms       2.089ms       0.000us         0.00%      26.817us      26.817us             1  
-                                               aten::to         0.33%       7.000us        83.76%       1.754ms     292.349us       0.000us         0.00%      15.265us       2.544us             6  
-                                         aten::_to_copy         1.23%      25.779us        83.43%       1.747ms     291.183us       0.000us         0.00%      15.265us       2.544us             6  
-                                            aten::copy_         2.49%      52.100us        80.65%       1.689ms     281.484us      12.993us        52.94%      15.265us       2.544us             6  
-                                           aten::conv1d         0.31%       6.410us         6.85%     143.364us      47.788us       0.000us         0.00%      11.552us       3.851us             3  
-                                      aten::convolution         1.48%      31.021us         6.54%     136.954us      45.651us       0.000us         0.00%      11.552us       3.851us             3  
-                                     aten::_convolution         1.13%      23.621us         5.06%     105.933us      35.311us       0.000us         0.00%      11.552us       3.851us             3  
-                                aten::_conv_depthwise2d         1.06%      22.209us         3.13%      65.632us      21.877us      11.552us        47.06%      11.552us       3.851us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.06%      11.552us       3.851us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.625us        26.99%       6.625us       2.208us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        25.94%       6.368us       2.123us             3  
-                                Activity Buffer Request        68.76%       1.440ms        68.76%       1.440ms       1.440ms       2.272us         9.26%       2.272us       2.272us             1  
-                                    aten::empty_strided         1.55%      32.413us         1.55%      32.413us       5.402us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.50%     219.817us        10.50%     219.817us      24.424us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      18.301us         1.15%      24.061us       2.673us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.50%      10.530us         0.50%      10.530us       0.702us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%      10.490us         0.50%      10.490us       3.497us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.872us         0.47%       9.872us       3.291us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.220us         0.37%       7.740us       2.580us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.374us      1318.69%     325.374us     325.374us             1  
+                                            torch_eager         6.23%     145.210us        99.78%       2.326ms       2.326ms       0.000us         0.00%      26.978us      26.978us             1  
+                                               aten::to         0.28%       6.471us        87.58%       2.041ms     340.232us       0.000us         0.00%      15.298us       2.550us             6  
+                                         aten::_to_copy         1.01%      23.559us        87.30%       2.035ms     339.154us       0.000us         0.00%      15.298us       2.550us             6  
+                                            aten::copy_         2.04%      47.563us        85.03%       1.982ms     330.320us      12.994us        52.66%      15.298us       2.550us             6  
+                                           aten::conv1d         0.26%       6.060us         4.91%     114.341us      38.114us       0.000us         0.00%      11.680us       3.893us             3  
+                                      aten::convolution         0.40%       9.250us         4.65%     108.281us      36.094us       0.000us         0.00%      11.680us       3.893us             3  
+                                     aten::_convolution         0.89%      20.669us         4.25%      99.031us      33.010us       0.000us         0.00%      11.680us       3.893us             3  
+                                aten::_conv_depthwise2d         0.95%      22.039us         2.73%      63.550us      21.183us      11.680us        47.34%      11.680us       3.893us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.680us        47.34%      11.680us       3.893us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.657us        26.98%       6.657us       2.219us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.337us        25.68%       6.337us       2.112us             3  
+                                Activity Buffer Request        74.59%       1.739ms        74.59%       1.739ms       1.739ms       2.304us         9.34%       2.304us       2.304us             1  
+                                    aten::empty_strided         1.26%      29.442us         1.26%      29.442us       4.907us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.39%     218.802us         9.39%     218.802us      24.311us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.69%      16.041us         0.91%      21.173us       2.353us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.602us         0.37%       8.602us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.341us         0.40%       9.341us       3.114us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       8.990us         0.39%       8.990us       2.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.290us         0.28%       6.580us       2.193us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.094ms
-Self CUDA time total: 24.545us
+Self CPU time total: 2.331ms
+Self CUDA time total: 24.674us
 
 
 
@@ -4377,29 +4159,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     351.133us      1341.43%     351.133us     351.133us             1  
-                                            torch_eager         7.55%     157.812us        99.73%       2.084ms       2.084ms       0.000us         0.00%      28.416us      28.416us             1  
-                                               aten::to         0.31%       6.571us        84.80%       1.772ms     295.318us       0.000us         0.00%      15.264us       2.544us             6  
-                                         aten::_to_copy         1.22%      25.450us        84.49%       1.765ms     294.223us       0.000us         0.00%      15.264us       2.544us             6  
-                                            aten::copy_         2.31%      48.301us        81.82%       1.710ms     284.947us      13.024us        49.76%      15.264us       2.544us             6  
-                                           aten::conv1d         0.32%       6.640us         5.96%     124.543us      41.514us       0.000us         0.00%      13.152us       4.384us             3  
-                                      aten::convolution         0.50%      10.360us         5.64%     117.903us      39.301us       0.000us         0.00%      13.152us       4.384us             3  
-                                     aten::_convolution         1.16%      24.330us         5.15%     107.543us      35.848us       0.000us         0.00%      13.152us       4.384us             3  
-                                aten::_conv_depthwise2d         1.06%      22.241us         3.14%      65.623us      21.874us      13.152us        50.24%      13.152us       4.384us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.152us        50.24%      13.152us       4.384us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us        25.43%       6.656us       2.219us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.33%       6.368us       2.123us             3  
-                                Activity Buffer Request        70.10%       1.465ms        70.10%       1.465ms       1.465ms       2.240us         8.56%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.45%      30.202us         1.45%      30.202us       5.034us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.51%     219.677us        10.51%     219.677us      24.409us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.90%      18.881us         1.17%      24.421us       2.713us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.580us         0.46%       9.580us       0.639us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      11.471us         0.55%      11.471us       3.824us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.43%       8.890us         0.43%       8.890us       2.963us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.33%       6.950us         0.40%       8.400us       2.800us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.853us      1241.91%     325.853us     325.853us             1  
+                                            torch_eager         6.02%     142.382us        99.78%       2.359ms       2.359ms       0.000us         0.00%      28.510us      28.510us             1  
+                                               aten::to         0.27%       6.279us        87.80%       2.076ms     345.959us       0.000us         0.00%      15.262us       2.544us             6  
+                                         aten::_to_copy         0.97%      22.980us        87.54%       2.069ms     344.912us       0.000us         0.00%      15.262us       2.544us             6  
+                                            aten::copy_         2.02%      47.672us        85.33%       2.017ms     336.189us      12.990us        49.51%      15.262us       2.544us             6  
+                                           aten::conv1d         0.27%       6.391us         4.88%     115.262us      38.421us       0.000us         0.00%      13.248us       4.416us             3  
+                                      aten::convolution         0.41%       9.629us         4.61%     108.871us      36.290us       0.000us         0.00%      13.248us       4.416us             3  
+                                     aten::_convolution         0.88%      20.800us         4.20%      99.242us      33.081us       0.000us         0.00%      13.248us       4.416us             3  
+                                aten::_conv_depthwise2d         0.93%      21.882us         2.62%      62.041us      20.680us      13.248us        50.49%      13.248us       4.416us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.248us        50.49%      13.248us       4.416us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.622us        25.24%       6.622us       2.207us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.27%       6.368us       2.123us             3  
+                                Activity Buffer Request        75.21%       1.778ms        75.21%       1.778ms       1.778ms       2.272us         8.66%       2.272us       2.272us             1  
+                                    aten::empty_strided         1.24%      29.361us         1.24%      29.361us       4.893us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.97%     212.032us         8.97%     212.032us      23.559us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.75%      17.821us         0.98%      23.130us       2.570us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.699us         0.37%       8.699us       0.580us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.38%       9.090us         0.38%       9.090us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%      10.480us         0.44%      10.480us       3.493us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.631us         0.30%       7.011us       2.337us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.089ms
-Self CUDA time total: 26.176us
+Self CPU time total: 2.364ms
+Self CUDA time total: 26.238us
 
 
 
@@ -4409,29 +4191,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     349.627us       908.24%     349.627us     349.627us             1  
-                                            torch_eager         7.45%     152.992us        99.76%       2.049ms       2.049ms       0.000us         0.00%      41.086us      41.086us             1  
-                                           aten::conv1d         0.32%       6.640us         6.06%     124.413us      41.471us       0.000us         0.00%      22.561us       7.520us             3  
-                                      aten::convolution         0.50%      10.370us         5.73%     117.773us      39.258us       0.000us         0.00%      22.561us       7.520us             3  
-                                     aten::_convolution         1.14%      23.411us         5.23%     107.403us      35.801us       0.000us         0.00%      22.561us       7.520us             3  
-                                aten::_conv_depthwise2d         1.15%      23.650us         3.29%      67.532us      22.511us      22.561us        58.61%      22.561us       7.520us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.561us        58.61%      22.561us       7.520us             3  
-                                               aten::to         0.33%       6.780us        84.82%       1.743ms     290.446us       0.000us         0.00%      18.525us       3.087us             6  
-                                         aten::_to_copy         1.29%      26.502us        84.49%       1.736ms     289.316us       0.000us         0.00%      18.525us       3.087us             6  
-                                            aten::copy_         2.40%      49.251us        81.74%       1.679ms     279.869us      15.934us        41.39%      18.525us       3.087us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.543us        22.19%       8.543us       2.848us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.391us        19.20%       7.391us       2.464us             3  
-                                Activity Buffer Request        69.84%       1.435ms        69.84%       1.435ms       1.435ms       2.591us         6.73%       2.591us       2.591us             1  
-                                    aten::empty_strided         1.47%      30.182us         1.47%      30.182us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.64%     218.664us        10.64%     218.664us      24.296us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      18.281us         1.17%      24.011us       2.668us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       9.739us         0.47%       9.739us       0.649us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.53%      10.991us         0.53%      10.991us       3.664us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       9.421us         0.46%       9.421us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.970us         0.36%       7.320us       2.440us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     331.328us       858.50%     331.328us     331.328us             1  
+                                            torch_eager         5.97%     146.471us        99.79%       2.446ms       2.446ms       0.000us         0.00%      41.186us      41.186us             1  
+                                           aten::conv1d         0.25%       6.210us         4.77%     116.961us      38.987us       0.000us         0.00%      22.849us       7.616us             3  
+                                      aten::convolution         0.40%       9.740us         4.52%     110.751us      36.917us       0.000us         0.00%      22.849us       7.616us             3  
+                                     aten::_convolution         0.89%      21.911us         4.12%     101.011us      33.670us       0.000us         0.00%      22.849us       7.616us             3  
+                                aten::_conv_depthwise2d         0.92%      22.550us         2.59%      63.530us      21.177us      22.849us        59.20%      22.849us       7.616us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.849us        59.20%      22.849us       7.616us             3  
+                                               aten::to         0.25%       6.228us        88.01%       2.158ms     359.617us       0.000us         0.00%      18.337us       3.056us             6  
+                                         aten::_to_copy         1.00%      24.602us        87.76%       2.151ms     358.579us       0.000us         0.00%      18.337us       3.056us             6  
+                                            aten::copy_         1.98%      48.619us        85.49%       2.096ms     349.334us      15.745us        40.80%      18.337us       3.056us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.385us        21.73%       8.385us       2.795us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        19.07%       7.360us       2.453us             3  
+                                Activity Buffer Request        75.73%       1.857ms        75.73%       1.857ms       1.857ms       2.592us         6.72%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.26%      30.871us         1.26%      30.871us       5.145us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.69%     213.074us         8.69%     213.074us      23.675us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.69%      16.899us         0.91%      22.302us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.35%       8.674us         0.35%       8.674us       0.578us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.39%       9.670us         0.39%       9.670us       3.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       9.000us         0.37%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.570us         0.28%       6.790us       2.263us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.054ms
-Self CUDA time total: 38.495us
+Self CPU time total: 2.452ms
+Self CUDA time total: 38.594us
 
 
 
@@ -4441,29 +4223,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     345.054us       837.81%     345.054us     345.054us             1  
-                                            torch_eager         7.39%     151.695us        99.75%       2.049ms       2.049ms       0.000us         0.00%      43.810us      43.810us             1  
-                                           aten::conv1d         0.32%       6.620us         6.03%     123.883us      41.294us       0.000us         0.00%      25.375us       8.458us             3  
-                                      aten::convolution         0.50%      10.320us         5.71%     117.263us      39.088us       0.000us         0.00%      25.375us       8.458us             3  
-                                     aten::_convolution         1.20%      24.592us         5.21%     106.943us      35.648us       0.000us         0.00%      25.375us       8.458us             3  
-                                aten::_conv_depthwise2d         1.13%      23.150us         3.19%      65.451us      21.817us      25.375us        61.61%      25.375us       8.458us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.375us        61.61%      25.375us       8.458us             3  
-                                               aten::to         0.31%       6.440us        84.93%       1.744ms     290.716us       0.000us         0.00%      18.435us       3.072us             6  
-                                         aten::_to_copy         1.24%      25.501us        84.61%       1.738ms     289.642us       0.000us         0.00%      18.435us       3.072us             6  
-                                            aten::copy_         2.41%      49.431us        81.91%       1.682ms     280.380us      15.810us        38.39%      18.435us       3.072us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.386us        20.36%       8.386us       2.795us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        18.03%       7.424us       2.475us             3  
-                                Activity Buffer Request        70.32%       1.444ms        70.32%       1.444ms       1.444ms       2.625us         6.37%       2.625us       2.625us             1  
-                                    aten::empty_strided         1.46%      30.070us         1.46%      30.070us       5.012us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.28%     211.144us        10.28%     211.144us      23.460us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      18.949us         1.19%      24.411us       2.712us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.313us         0.45%       9.313us       0.621us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.52%      10.601us         0.52%      10.601us       3.534us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%       9.110us         0.44%       9.110us       3.037us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.930us         0.36%       7.410us       2.470us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.382us       781.00%     324.382us     324.382us             1  
+                                            torch_eager         6.15%     143.693us        99.76%       2.329ms       2.329ms       0.000us         0.00%      44.158us      44.158us             1  
+                                           aten::conv1d         0.25%       5.870us         4.90%     114.381us      38.127us       0.000us         0.00%      25.694us       8.565us             3  
+                                      aten::convolution         0.39%       9.129us         4.65%     108.511us      36.170us       0.000us         0.00%      25.694us       8.565us             3  
+                                     aten::_convolution         0.92%      21.560us         4.26%      99.382us      33.127us       0.000us         0.00%      25.694us       8.565us             3  
+                                aten::_conv_depthwise2d         0.91%      21.251us         2.67%      62.331us      20.777us      25.694us        61.86%      25.694us       8.565us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.694us        61.86%      25.694us       8.565us             3  
+                                               aten::to         0.26%       6.051us        87.64%       2.046ms     341.007us       0.000us         0.00%      18.464us       3.077us             6  
+                                         aten::_to_copy         0.99%      23.033us        87.38%       2.040ms     339.999us       0.000us         0.00%      18.464us       3.077us             6  
+                                            aten::copy_         2.09%      48.709us        85.05%       1.985ms     330.910us      15.840us        38.14%      18.464us       3.077us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        20.34%       8.448us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        17.80%       7.392us       2.464us             3  
+                                Activity Buffer Request        74.80%       1.746ms        74.80%       1.746ms       1.746ms       2.624us         6.32%       2.624us       2.624us             1  
+                                    aten::empty_strided         1.35%      31.498us         1.35%      31.498us       5.250us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.10%     212.334us         9.10%     212.334us      23.593us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.311us         0.92%      21.550us       2.394us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.38%       8.780us         0.38%       8.780us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.39%       9.170us         0.39%       9.170us       3.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%      10.170us         0.44%      10.170us       3.390us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.530us         0.30%       6.891us       2.297us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.054ms
-Self CUDA time total: 41.185us
+Self CPU time total: 2.335ms
+Self CUDA time total: 41.534us
 
 
 
@@ -4473,29 +4255,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.348us       338.39%     348.348us     348.348us             1  
-                                            torch_eager         7.21%     148.863us        99.73%       2.059ms       2.059ms       0.000us         0.00%     108.926us     108.926us             1  
-                                           aten::conv1d         0.31%       6.430us         5.95%     122.893us      40.964us       0.000us         0.00%      70.592us      23.531us             3  
-                                      aten::convolution         0.50%      10.290us         5.64%     116.463us      38.821us       0.000us         0.00%      70.592us      23.531us             3  
-                                     aten::_convolution         1.17%      24.211us         5.14%     106.173us      35.391us       0.000us         0.00%      70.592us      23.531us             3  
-                                aten::_conv_depthwise2d         1.12%      23.052us         3.16%      65.282us      21.761us      70.592us        68.57%      70.592us      23.531us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.592us        68.57%      70.592us      23.531us             3  
-                                               aten::to         0.31%       6.372us        85.15%       1.758ms     292.949us       0.000us         0.00%      38.334us       6.389us             6  
-                                         aten::_to_copy         1.20%      24.680us        84.84%       1.751ms     291.887us       0.000us         0.00%      38.334us       6.389us             6  
-                                            aten::copy_         2.47%      51.072us        82.20%       1.697ms     282.787us      32.350us        31.43%      38.334us       6.389us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.695us        17.19%      17.695us       5.898us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.655us        14.24%      14.655us       4.885us             3  
-                                Activity Buffer Request        70.59%       1.457ms        70.59%       1.457ms       1.457ms       5.984us         5.81%       5.984us       5.984us             1  
-                                    aten::empty_strided         1.45%      29.921us         1.45%      29.921us       4.987us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.23%     211.264us        10.23%     211.264us      23.474us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      18.462us         1.17%      24.111us       2.679us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       9.709us         0.47%       9.709us       0.647us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.780us         0.47%       9.780us       3.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.740us         0.47%       9.740us       3.247us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.880us         0.35%       7.260us       2.420us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.038us       307.34%     319.038us     319.038us             1  
+                                            torch_eager         4.95%     115.620us        99.75%       2.329ms       2.329ms       0.000us         0.00%     109.886us     109.886us             1  
+                                           aten::conv1d         0.24%       5.500us         4.79%     111.722us      37.241us       0.000us         0.00%      71.360us      23.787us             3  
+                                      aten::convolution         0.38%       8.820us         4.55%     106.222us      35.407us       0.000us         0.00%      71.360us      23.787us             3  
+                                     aten::_convolution         0.86%      20.169us         4.17%      97.402us      32.467us       0.000us         0.00%      71.360us      23.787us             3  
+                                aten::_conv_depthwise2d         0.88%      20.499us         2.70%      62.992us      20.997us      71.360us        68.74%      71.360us      23.787us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      71.360us        68.74%      71.360us      23.787us             3  
+                                               aten::to         0.25%       5.942us        88.99%       2.078ms     346.257us       0.000us         0.00%      38.526us       6.421us             6  
+                                         aten::_to_copy         0.97%      22.531us        88.74%       2.072ms     345.267us       0.000us         0.00%      38.526us       6.421us             6  
+                                            aten::copy_         1.95%      45.459us        86.50%       2.019ms     336.557us      32.447us        31.26%      38.526us       6.421us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.791us        17.14%      17.791us       5.930us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.656us        14.12%      14.656us       4.885us             3  
+                                Activity Buffer Request        76.44%       1.784ms        76.44%       1.784ms       1.784ms       6.079us         5.86%       6.079us       6.079us             1  
+                                    aten::empty_strided         1.27%      29.730us         1.27%      29.730us       4.955us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.13%     213.066us         9.13%     213.066us      23.674us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.66%      15.410us         0.85%      19.870us       2.208us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.33%       7.790us         0.33%       7.790us       0.519us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.44%      10.351us         0.44%      10.351us       3.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.36%       8.461us         0.36%       8.461us       2.820us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.401us         0.29%       6.691us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.064ms
-Self CUDA time total: 102.942us
+Self CPU time total: 2.335ms
+Self CUDA time total: 103.807us
 
 
 
@@ -4505,29 +4287,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     344.181us       304.53%     344.181us     344.181us             1  
-                                            torch_eager        14.98%     124.863us        99.35%     828.302us     828.302us       0.000us         0.00%     119.036us     119.036us             1  
-                                           aten::conv1d         0.70%       5.870us        14.55%     121.343us      40.448us       0.000us         0.00%      80.669us      26.890us             3  
-                                      aten::convolution         1.17%       9.720us        13.85%     115.473us      38.491us       0.000us         0.00%      80.669us      26.890us             3  
-                                     aten::_convolution         2.96%      24.691us        12.68%     105.753us      35.251us       0.000us         0.00%      80.669us      26.890us             3  
-                                aten::_conv_depthwise2d         2.65%      22.121us         7.65%      63.762us      21.254us      80.669us        71.38%      80.669us      26.890us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.669us        71.38%      80.669us      26.890us             3  
-                                               aten::to         0.77%       6.429us        66.53%     554.705us      92.451us       0.000us         0.00%      38.367us       6.394us             6  
-                                         aten::_to_copy         3.01%      25.101us        65.76%     548.276us      91.379us       0.000us         0.00%      38.367us       6.394us             6  
-                                            aten::copy_         6.16%      51.352us        59.05%     492.343us      82.057us      32.351us        28.62%      38.367us       6.394us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        15.66%      17.696us       5.899us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.655us        12.97%      14.655us       4.885us             3  
-                                Activity Buffer Request        28.81%     240.197us        28.81%     240.197us     240.197us       6.016us         5.32%       6.016us       6.016us             1  
-                                    aten::empty_strided         3.70%      30.832us         3.70%      30.832us       5.139us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.65%     222.174us        26.65%     222.174us      24.686us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.09%      17.401us         2.70%      22.541us       2.505us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.05%       8.790us         1.05%       8.790us       0.586us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.34%      11.151us         1.34%      11.151us       3.717us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.09%       9.110us         1.09%       9.110us       3.037us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.89%       7.450us         1.05%       8.790us       2.930us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     320.032us       281.56%     320.032us     320.032us             1  
+                                            torch_eager         4.89%     112.502us        99.77%       2.297ms       2.297ms       0.000us         0.00%     119.649us     119.649us             1  
+                                           aten::conv1d         0.24%       5.540us         4.86%     111.980us      37.327us       0.000us         0.00%      81.407us      27.136us             3  
+                                      aten::convolution         0.38%       8.839us         4.62%     106.440us      35.480us       0.000us         0.00%      81.407us      27.136us             3  
+                                     aten::_convolution         0.90%      20.821us         4.24%      97.601us      32.534us       0.000us         0.00%      81.407us      27.136us             3  
+                                aten::_conv_depthwise2d         0.94%      21.639us         2.69%      61.990us      20.663us      81.407us        71.62%      81.407us      27.136us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      81.407us        71.62%      81.407us      27.136us             3  
+                                               aten::to         0.26%       5.912us        88.93%       2.047ms     341.211us       0.000us         0.00%      38.242us       6.374us             6  
+                                         aten::_to_copy         0.96%      22.099us        88.68%       2.041ms     340.225us       0.000us         0.00%      38.242us       6.374us             6  
+                                            aten::copy_         2.13%      49.062us        86.51%       1.991ms     331.902us      32.257us        28.38%      38.242us       6.374us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.665us        15.54%      17.665us       5.888us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.592us        12.84%      14.592us       4.864us             3  
+                                Activity Buffer Request        76.05%       1.751ms        76.05%       1.751ms       1.751ms       5.985us         5.27%       5.985us       5.985us             1  
+                                    aten::empty_strided         1.21%      27.841us         1.21%      27.841us       4.640us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.26%     213.213us         9.26%     213.213us      23.690us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.150us         0.91%      21.061us       2.340us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.381us         0.36%       8.381us       0.559us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.130us         0.40%       9.130us       3.043us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       9.600us         0.42%       9.600us       3.200us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.419us         0.29%       6.669us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 833.752us
-Self CUDA time total: 113.020us
+Self CPU time total: 2.302ms
+Self CUDA time total: 113.664us
 
 
 
@@ -4537,29 +4319,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        14.21%     122.455us        95.83%     825.681us     825.681us       0.000us         0.00%     433.339us     433.339us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     419.771us       106.59%     419.771us     419.771us             1  
-                                           aten::conv1d         0.75%       6.429us        14.10%     121.522us      40.507us       0.000us         0.00%     251.453us      83.818us             3  
-                                      aten::convolution         1.15%       9.929us        13.36%     115.093us      38.364us       0.000us         0.00%     251.453us      83.818us             3  
-                                     aten::_convolution         2.67%      23.042us        12.21%     105.164us      35.055us       0.000us         0.00%     251.453us      83.818us             3  
-                                aten::_conv_depthwise2d         2.60%      22.440us         7.52%      64.810us      21.603us     251.453us        63.85%     251.453us      83.818us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.453us        63.85%     251.453us      83.818us             3  
-                                               aten::to         0.70%       6.001us        64.14%     552.672us      92.112us       0.000us         0.00%     181.886us      30.314us             6  
-                                         aten::_to_copy         2.73%      23.540us        63.45%     546.671us      91.112us       0.000us         0.00%     181.886us      30.314us             6  
-                                            aten::copy_         5.94%      51.140us        57.36%     494.211us      82.368us     142.367us        36.15%     181.886us      30.314us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.367us        25.99%     102.367us      34.122us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.000us        10.16%      40.000us      13.333us             3  
-                                Activity Buffer Request        29.04%     250.247us        29.04%     250.247us     250.247us      39.519us        10.03%      39.519us      39.519us             1  
-                                    aten::empty_strided         3.36%      28.920us         3.36%      28.920us       4.820us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.89%     214.494us        24.89%     214.494us      23.833us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.98%      17.062us         2.59%      22.273us       2.475us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       9.391us         1.09%       9.391us       0.626us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.24%      10.660us         1.24%      10.660us       3.553us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%      10.040us         1.17%      10.040us       3.347us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.86%       7.370us         1.02%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         4.70%     113.641us        96.03%       2.320ms       2.320ms       0.000us         0.00%     464.763us     464.763us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     453.786us       106.62%     453.786us     453.786us             1  
+                                           aten::conv1d         0.23%       5.630us         4.62%     111.673us      37.224us       0.000us         0.00%     278.940us      92.980us             3  
+                                      aten::convolution         0.36%       8.651us         4.39%     106.043us      35.348us       0.000us         0.00%     278.940us      92.980us             3  
+                                     aten::_convolution         0.86%      20.739us         4.03%      97.392us      32.464us       0.000us         0.00%     278.940us      92.980us             3  
+                                aten::_conv_depthwise2d         0.90%      21.710us         2.57%      62.062us      20.687us     278.940us        65.54%     278.940us      92.980us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     278.940us        65.54%     278.940us      92.980us             3  
+                                               aten::to         0.24%       5.880us        85.69%       2.071ms     345.102us       0.000us         0.00%     185.823us      30.970us             6  
+                                         aten::_to_copy         0.90%      21.820us        85.45%       2.065ms     344.122us       0.000us         0.00%     185.823us      30.970us             6  
+                                            aten::copy_         1.99%      48.071us        83.40%       2.015ms     335.882us     146.655us        34.46%     185.823us      30.970us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     105.919us        24.89%     105.919us      35.306us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.736us         9.57%      40.736us      13.579us             3  
+                                Activity Buffer Request        72.26%       1.746ms        72.26%       1.746ms       1.746ms      39.168us         9.20%      39.168us      39.168us             1  
+                                    aten::empty_strided         1.14%      27.621us         1.14%      27.621us       4.604us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.07%     243.344us        10.07%     243.344us      27.038us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.66%      15.908us         0.86%      20.760us       2.307us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.34%       8.262us         0.34%       8.262us       0.551us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.37%       8.921us         0.37%       8.921us       2.974us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.38%       9.260us         0.38%       9.260us       3.087us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.22%       5.361us         0.27%       6.641us       2.214us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 861.602us
-Self CUDA time total: 393.820us
+Self CPU time total: 2.416ms
+Self CUDA time total: 425.595us
 
 
 
@@ -4569,29 +4351,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        15.32%     134.312us        91.67%     803.971us     803.971us       0.000us         0.00%     487.924us     487.924us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     476.501us       106.34%     476.501us     476.501us             1  
-                                           aten::conv1d         0.67%       5.860us        13.82%     121.173us      40.391us       0.000us         0.00%     299.161us      99.720us             3  
-                                      aten::convolution         1.17%      10.220us        13.15%     115.313us      38.438us       0.000us         0.00%     299.161us      99.720us             3  
-                                     aten::_convolution         2.67%      23.450us        11.98%     105.093us      35.031us       0.000us         0.00%     299.161us      99.720us             3  
-                                aten::_conv_depthwise2d         2.56%      22.451us         7.48%      65.623us      21.874us     299.161us        66.76%     299.161us      99.720us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     299.161us        66.76%     299.161us      99.720us             3  
-                                               aten::to         0.69%       6.051us        59.17%     518.906us      86.484us       0.000us         0.00%     188.763us      31.460us             6  
-                                         aten::_to_copy         2.71%      23.771us        58.48%     512.855us      85.476us       0.000us         0.00%     188.763us      31.460us             6  
-                                            aten::copy_         5.69%      49.880us        52.31%     458.742us      76.457us     148.924us        33.24%     188.763us      31.460us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.861us        24.29%     108.861us      36.287us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.063us         8.94%      40.063us      13.354us             3  
-                                Activity Buffer Request        25.01%     219.366us        25.01%     219.366us     219.366us      39.839us         8.89%      39.839us      39.839us             1  
-                                    aten::empty_strided         3.46%      30.342us         3.46%      30.342us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.34%     213.439us        24.34%     213.439us      23.715us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.98%      17.400us         2.59%      22.720us       2.524us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       9.540us         1.09%       9.540us       0.636us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.14%      10.010us         1.14%      10.010us       3.337us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.05%       9.219us         1.05%       9.219us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.66%       5.750us         0.82%       7.210us       2.403us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         4.81%     115.230us        95.51%       2.289ms       2.289ms       0.000us         0.00%     473.560us     473.560us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     466.268us       106.59%     466.268us     466.268us             1  
+                                           aten::conv1d         0.23%       5.540us         4.63%     111.002us      37.001us       0.000us         0.00%     298.430us      99.477us             3  
+                                      aten::convolution         0.37%       8.900us         4.40%     105.462us      35.154us       0.000us         0.00%     298.430us      99.477us             3  
+                                     aten::_convolution         0.85%      20.430us         4.03%      96.562us      32.187us       0.000us         0.00%     298.430us      99.477us             3  
+                                aten::_conv_depthwise2d         0.86%      20.562us         2.57%      61.592us      20.531us     298.430us        68.22%     298.430us      99.477us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.430us        68.22%     298.430us      99.477us             3  
+                                               aten::to         0.24%       5.669us        85.05%       2.039ms     339.802us       0.000us         0.00%     175.130us      29.188us             6  
+                                         aten::_to_copy         0.96%      22.942us        84.82%       2.033ms     338.857us       0.000us         0.00%     175.130us      29.188us             6  
+                                            aten::copy_         2.01%      48.190us        82.64%       1.981ms     330.170us     139.003us        31.78%     175.130us      29.188us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      98.430us        22.50%      98.430us      32.810us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.573us         9.28%      40.573us      13.524us             3  
+                                Activity Buffer Request        72.81%       1.745ms        72.81%       1.745ms       1.745ms      36.127us         8.26%      36.127us      36.127us             1  
+                                    aten::empty_strided         1.22%      29.180us         1.22%      29.180us       4.863us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.73%     209.224us         8.73%     209.224us      23.247us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.66%      15.770us         0.87%      20.750us       2.306us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.35%       8.340us         0.35%       8.340us       0.556us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%      10.290us         0.43%      10.290us       3.430us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       8.960us         0.37%       8.960us       2.987us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.22%       5.340us         0.28%       6.610us       2.203us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 876.983us
-Self CUDA time total: 448.085us
+Self CPU time total: 2.397ms
+Self CUDA time total: 437.433us
 
 
 
@@ -4601,29 +4383,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.392us      1804.85%     338.392us     338.392us             1  
-                                            torch_eager        18.33%     161.236us        99.35%     873.703us     873.703us       0.000us         0.00%      20.637us      20.637us             1  
-                                               aten::to         0.69%       6.070us        63.71%     560.224us      93.371us       0.000us         0.00%      13.406us       2.234us             6  
-                                         aten::_to_copy         2.78%      24.471us        63.02%     554.154us      92.359us       0.000us         0.00%      13.406us       2.234us             6  
-                                            aten::copy_         5.94%      52.212us        56.85%     499.953us      83.325us      11.518us        61.43%      13.406us       2.234us             6  
-                                           aten::conv1d         0.64%       5.659us        14.02%     123.282us      41.094us       0.000us         0.00%       7.231us       2.410us             3  
-                                      aten::convolution         1.14%       9.999us        13.38%     117.623us      39.208us       0.000us         0.00%       7.231us       2.410us             3  
-                                     aten::_convolution         2.72%      23.952us        12.24%     107.624us      35.875us       0.000us         0.00%       7.231us       2.410us             3  
-                                aten::_conv_depthwise2d         2.67%      23.519us         7.63%      67.130us      22.377us       7.231us        38.57%       7.231us       2.410us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.231us        38.57%       7.231us       2.410us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.854us        31.22%       5.854us       1.951us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        30.21%       5.664us       1.888us             3  
-                                Activity Buffer Request        29.52%     259.596us        29.52%     259.596us     259.596us       1.888us        10.07%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.38%      29.730us         3.38%      29.730us       4.955us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.99%     210.946us        23.99%     210.946us      23.438us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.07%      18.190us         2.71%      23.871us       2.652us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.11%       9.761us         1.11%       9.761us       0.651us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.24%      10.890us         1.24%      10.890us       3.630us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.13%       9.920us         1.13%       9.920us       3.307us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.68%       5.972us         0.85%       7.452us       2.484us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.149us      1725.02%     325.149us     325.149us             1  
+                                            torch_eager         4.86%     112.628us        99.78%       2.311ms       2.311ms       0.000us         0.00%      20.769us      20.769us             1  
+                                               aten::to         0.26%       5.932us        88.67%       2.054ms     342.251us       0.000us         0.00%      13.536us       2.256us             6  
+                                         aten::_to_copy         1.00%      23.270us        88.41%       2.048ms     341.262us       0.000us         0.00%      13.536us       2.256us             6  
+                                            aten::copy_         2.14%      49.511us        86.15%       1.995ms     332.552us      11.616us        61.63%      13.536us       2.256us             6  
+                                           aten::conv1d         0.24%       5.480us         5.19%     120.221us      40.074us       0.000us         0.00%       7.233us       2.411us             3  
+                                      aten::convolution         0.37%       8.641us         4.95%     114.741us      38.247us       0.000us         0.00%       7.233us       2.411us             3  
+                                     aten::_convolution         0.88%      20.361us         4.58%     106.100us      35.367us       0.000us         0.00%       7.233us       2.411us             3  
+                                aten::_conv_depthwise2d         0.96%      22.180us         3.05%      70.680us      23.560us       7.233us        38.37%       7.233us       2.411us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.233us        38.37%       7.233us       2.411us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        31.41%       5.920us       1.973us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.22%       5.696us       1.899us             3  
+                                Activity Buffer Request        75.90%       1.758ms        75.90%       1.758ms       1.758ms       1.920us        10.19%       1.920us       1.920us             1  
+                                    aten::empty_strided         1.25%      28.990us         1.25%      28.990us       4.832us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.42%     218.162us         9.42%     218.162us      24.240us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.68%      15.833us         0.90%      20.731us       2.303us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.468us         0.37%       8.468us       0.565us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.220us         0.40%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       8.980us         0.39%       8.980us       2.993us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.550us         0.30%       7.000us       2.333us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 879.393us
-Self CUDA time total: 18.749us
+Self CPU time total: 2.316ms
+Self CUDA time total: 18.849us
 
 
 
@@ -4633,29 +4415,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.934us      1741.87%     338.934us     338.934us             1  
-                                            torch_eager        16.71%     145.362us        99.29%     863.592us     863.592us       0.000us         0.00%      21.314us      21.314us             1  
-                                               aten::to         0.71%       6.200us        65.36%     568.524us      94.754us       0.000us         0.00%      13.282us       2.214us             6  
-                                         aten::_to_copy         2.85%      24.831us        64.65%     562.324us      93.721us       0.000us         0.00%      13.282us       2.214us             6  
-                                            aten::copy_         5.81%      50.550us        58.39%     507.883us      84.647us      11.426us        58.72%      13.282us       2.214us             6  
-                                           aten::conv1d         0.78%       6.753us        14.06%     122.315us      40.772us       0.000us         0.00%       8.032us       2.677us             3  
-                                      aten::convolution         1.19%      10.380us        13.29%     115.562us      38.521us       0.000us         0.00%       8.032us       2.677us             3  
-                                     aten::_convolution         2.63%      22.841us        12.09%     105.182us      35.061us       0.000us         0.00%       8.032us       2.677us             3  
-                                aten::_conv_depthwise2d         2.65%      23.042us         7.65%      66.512us      22.171us       8.032us        41.28%       8.032us       2.677us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.032us        41.28%       8.032us       2.677us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.825us        29.94%       5.825us       1.942us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.601us        28.79%       5.601us       1.867us             3  
-                                Activity Buffer Request        30.62%     266.307us        30.62%     266.307us     266.307us       1.856us         9.54%       1.856us       1.856us             1  
-                                    aten::empty_strided         3.40%      29.610us         3.40%      29.610us       4.935us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.61%     214.076us        24.61%     214.076us      23.786us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.02%      17.612us         2.63%      22.841us       2.538us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.02%       8.840us         1.02%       8.840us       0.589us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.22%      10.630us         1.22%      10.630us       3.543us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.13%       9.790us         1.13%       9.790us       3.263us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.67%       5.798us         0.82%       7.109us       2.370us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     320.511us      1636.76%     320.511us     320.511us             1  
+                                            torch_eager         5.91%     139.372us        99.79%       2.353ms       2.353ms       0.000us         0.00%      21.598us      21.598us             1  
+                                               aten::to         0.25%       6.010us        87.93%       2.073ms     345.496us       0.000us         0.00%      13.663us       2.277us             6  
+                                         aten::_to_copy         0.96%      22.549us        87.67%       2.067ms     344.494us       0.000us         0.00%      13.663us       2.277us             6  
+                                            aten::copy_         2.09%      49.251us        85.51%       2.016ms     335.977us      11.647us        59.48%      13.663us       2.277us             6  
+                                           aten::conv1d         0.26%       6.081us         4.89%     115.321us      38.440us       0.000us         0.00%       7.935us       2.645us             3  
+                                      aten::convolution         0.40%       9.450us         4.63%     109.240us      36.413us       0.000us         0.00%       7.935us       2.645us             3  
+                                     aten::_convolution         0.90%      21.168us         4.23%      99.790us      33.263us       0.000us         0.00%       7.935us       2.645us             3  
+                                aten::_conv_depthwise2d         0.87%      20.610us         2.67%      62.871us      20.957us       7.935us        40.52%       7.935us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.935us        40.52%       7.935us       2.645us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.983us        30.55%       5.983us       1.994us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        28.92%       5.664us       1.888us             3  
+                                Activity Buffer Request        75.47%       1.779ms        75.47%       1.779ms       1.779ms       2.016us        10.30%       2.016us       2.016us             1  
+                                    aten::empty_strided         1.21%      28.551us         1.21%      28.551us       4.759us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.91%     210.105us         8.91%     210.105us      23.345us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.72%      16.961us         0.93%      21.872us       2.430us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.422us         0.36%       8.422us       0.561us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.46%      10.910us         0.46%      10.910us       3.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       8.650us         0.37%       8.650us       2.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.579us         0.30%       6.970us       2.323us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 869.783us
-Self CUDA time total: 19.458us
+Self CPU time total: 2.358ms
+Self CUDA time total: 19.582us
 
 
 
@@ -4665,29 +4447,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.862us      1751.78%     340.862us     340.862us             1  
-                                            torch_eager         8.44%     173.073us        99.74%       2.045ms       2.045ms       0.000us         0.00%      21.635us      21.635us             1  
-                                               aten::to         0.33%       6.670us        84.06%       1.723ms     287.196us       0.000us         0.00%      14.307us       2.385us             6  
-                                         aten::_to_copy         1.21%      24.883us        83.74%       1.717ms     286.084us       0.000us         0.00%      14.307us       2.385us             6  
-                                            aten::copy_         2.36%      48.471us        81.06%       1.662ms     276.949us      12.130us        62.34%      14.307us       2.385us             6  
-                                           aten::conv1d         0.29%       5.970us         5.84%     119.613us      39.871us       0.000us         0.00%       7.328us       2.443us             3  
-                                      aten::convolution         0.48%       9.780us         5.54%     113.643us      37.881us       0.000us         0.00%       7.328us       2.443us             3  
-                                     aten::_convolution         1.14%      23.420us         5.07%     103.863us      34.621us       0.000us         0.00%       7.328us       2.443us             3  
-                                aten::_conv_depthwise2d         1.10%      22.512us         3.15%      64.503us      21.501us       7.328us        37.66%       7.328us       2.443us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.66%       7.328us       2.443us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.241us        32.07%       6.241us       2.080us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.889us        30.27%       5.889us       1.963us             3  
-                                Activity Buffer Request        69.34%       1.421ms        69.34%       1.421ms       1.421ms       2.177us        11.19%       2.177us       2.177us             1  
-                                    aten::empty_strided         1.46%      29.930us         1.46%      29.930us       4.988us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.50%     215.256us        10.50%     215.256us      23.917us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.86%      17.669us         1.13%      23.180us       2.576us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       9.581us         0.47%       9.581us       0.639us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.759us         0.48%       9.759us       3.253us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.43%       8.742us         0.43%       8.742us       2.914us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.760us         0.35%       7.110us       2.370us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     310.009us      1591.01%     310.009us     310.009us             1  
+                                            torch_eager        14.85%     113.881us        99.35%     762.102us     762.102us       0.000us         0.00%      21.693us      21.693us             1  
+                                               aten::to         0.75%       5.742us        67.36%     516.710us      86.118us       0.000us         0.00%      14.398us       2.400us             6  
+                                         aten::_to_copy         2.84%      21.798us        66.61%     510.968us      85.161us       0.000us         0.00%      14.398us       2.400us             6  
+                                            aten::copy_         6.26%      48.021us        59.81%     458.808us      76.468us      12.190us        62.56%      14.398us       2.400us             6  
+                                           aten::conv1d         0.69%       5.290us        14.07%     107.951us      35.984us       0.000us         0.00%       7.295us       2.432us             3  
+                                      aten::convolution         1.14%       8.770us        13.38%     102.661us      34.220us       0.000us         0.00%       7.295us       2.432us             3  
+                                     aten::_convolution         2.56%      19.629us        12.24%      93.891us      31.297us       0.000us         0.00%       7.295us       2.432us             3  
+                                aten::_conv_depthwise2d         2.72%      20.851us         7.84%      60.152us      20.051us       7.295us        37.44%       7.295us       2.432us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.295us        37.44%       7.295us       2.432us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.271us        32.18%       6.271us       2.090us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.38%       5.919us       1.973us             3  
+                                Activity Buffer Request        29.70%     227.833us        29.70%     227.833us     227.833us       2.208us        11.33%       2.208us       2.208us             1  
+                                    aten::empty_strided         3.96%      30.362us         3.96%      30.362us       5.060us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.62%     204.185us        26.62%     204.185us      22.687us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.01%      15.431us         2.57%      19.700us       2.189us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.98%       7.520us         0.98%       7.520us       0.501us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.29%       9.930us         1.29%       9.930us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.06%       8.140us         1.06%       8.140us       2.713us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.67%       5.119us         0.83%       6.400us       2.133us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.050ms
-Self CUDA time total: 19.458us
+Self CPU time total: 767.122us
+Self CUDA time total: 19.485us
 
 
 
@@ -4697,29 +4479,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     367.067us      1820.95%     367.067us     367.067us             1  
-                                            torch_eager        17.50%     145.595us        99.30%     826.111us     826.111us       0.000us         0.00%      22.366us      22.366us             1  
-                                               aten::to         0.75%       6.199us        63.72%     530.082us      88.347us       0.000us         0.00%      14.431us       2.405us             6  
-                                         aten::_to_copy         2.95%      24.573us        62.97%     523.883us      87.314us       0.000us         0.00%      14.431us       2.405us             6  
-                                            aten::copy_         6.31%      52.521us        56.15%     467.170us      77.862us      12.223us        60.64%      14.431us       2.405us             6  
-                                           aten::conv1d         0.69%       5.760us        14.59%     121.354us      40.451us       0.000us         0.00%       7.935us       2.645us             3  
-                                      aten::convolution         1.24%      10.281us        13.89%     115.594us      38.531us       0.000us         0.00%       7.935us       2.645us             3  
-                                     aten::_convolution         2.68%      22.269us        12.66%     105.313us      35.104us       0.000us         0.00%       7.935us       2.645us             3  
-                                aten::_conv_depthwise2d         2.73%      22.701us         8.02%      66.711us      22.237us       7.935us        39.36%       7.935us       2.645us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.935us        39.36%       7.935us       2.645us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        31.27%       6.304us       2.101us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        29.36%       5.919us       1.973us             3  
-                                Activity Buffer Request        27.00%     224.665us        27.00%     224.665us     224.665us       2.208us        10.95%       2.208us       2.208us             1  
-                                    aten::empty_strided         3.86%      32.140us         3.86%      32.140us       5.357us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.71%     213.894us        25.71%     213.894us      23.766us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.05%      17.041us         2.71%      22.553us       2.506us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.14%       9.503us         1.14%       9.503us       0.634us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.31%      10.920us         1.31%      10.920us       3.640us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.10%       9.180us         1.10%       9.180us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.81%       6.740us         0.98%       8.160us       2.720us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     312.058us      1547.83%     312.058us     312.058us             1  
+                                            torch_eager        19.84%     167.701us        99.34%     839.603us     839.603us       0.000us         0.00%      22.369us      22.369us             1  
+                                               aten::to         0.69%       5.791us        63.55%     537.169us      89.528us       0.000us         0.00%      14.400us       2.400us             6  
+                                         aten::_to_copy         2.59%      21.910us        62.87%     531.378us      88.563us       0.000us         0.00%      14.400us       2.400us             6  
+                                            aten::copy_         5.79%      48.970us        56.91%     481.028us      80.171us      12.192us        60.47%      14.400us       2.400us             6  
+                                           aten::conv1d         0.65%       5.520us        13.10%     110.752us      36.917us       0.000us         0.00%       7.969us       2.656us             3  
+                                      aten::convolution         1.03%       8.700us        12.45%     105.232us      35.077us       0.000us         0.00%       7.969us       2.656us             3  
+                                     aten::_convolution         2.40%      20.311us        11.42%      96.532us      32.177us       0.000us         0.00%       7.969us       2.656us             3  
+                                aten::_conv_depthwise2d         2.39%      20.240us         7.28%      61.521us      20.507us       7.969us        39.53%       7.969us       2.656us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.969us        39.53%       7.969us       2.656us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        31.11%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.36%       5.920us       1.973us             3  
+                                Activity Buffer Request        29.19%     246.714us        29.19%     246.714us     246.714us       2.208us        10.95%       2.208us       2.208us             1  
+                                    aten::empty_strided         3.36%      28.440us         3.36%      28.440us       4.740us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.70%     208.775us        24.70%     208.775us      23.197us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.84%      15.580us         2.41%      20.350us       2.261us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.95%       8.049us         0.95%       8.049us       0.537us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.07%       9.050us         1.07%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.04%       8.800us         1.04%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.63%       5.361us         0.79%       6.650us       2.217us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 831.951us
-Self CUDA time total: 20.158us
+Self CPU time total: 845.213us
+Self CUDA time total: 20.161us
 
 
 
@@ -4729,29 +4511,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     363.100us      1005.93%     363.100us     363.100us             1  
-                                            torch_eager        14.77%     122.163us        99.35%     821.971us     821.971us       0.000us         0.00%      38.688us      38.688us             1  
-                                           aten::conv1d         0.72%       5.951us        17.29%     143.024us      47.675us       0.000us         0.00%      20.160us       6.720us             3  
-                                      aten::convolution         1.22%      10.110us        16.57%     137.073us      45.691us       0.000us         0.00%      20.160us       6.720us             3  
-                                     aten::_convolution         3.04%      25.151us        15.35%     126.963us      42.321us       0.000us         0.00%      20.160us       6.720us             3  
-                                aten::_conv_depthwise2d         4.80%      39.711us        10.31%      85.271us      28.424us      20.160us        55.85%      20.160us       6.720us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.160us        55.85%      20.160us       6.720us             3  
-                                               aten::to         0.75%       6.172us        63.79%     527.804us      87.967us       0.000us         0.00%      18.528us       3.088us             6  
-                                         aten::_to_copy         2.99%      24.751us        63.05%     521.632us      86.939us       0.000us         0.00%      18.528us       3.088us             6  
-                                            aten::copy_         6.14%      50.790us        56.45%     467.021us      77.837us      15.936us        44.15%      18.528us       3.088us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.512us        23.58%       8.512us       2.837us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        20.57%       7.424us       2.475us             3  
-                                Activity Buffer Request        27.93%     231.066us        27.93%     231.066us     231.066us       2.592us         7.18%       2.592us       2.592us             1  
-                                    aten::empty_strided         3.61%      29.860us         3.61%      29.860us       4.977us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.33%     209.585us        25.33%     209.585us      23.287us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.11%      17.441us         2.75%      22.791us       2.532us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.15%       9.501us         1.15%       9.501us       0.633us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.26%      10.400us         1.26%      10.400us       3.467us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.30%      10.740us         1.30%      10.740us       3.580us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.76%       6.269us         0.93%       7.730us       2.577us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     312.867us       859.95%     312.867us     312.867us             1  
+                                            torch_eager        14.44%     112.752us        99.36%     776.042us     776.042us       0.000us         0.00%      39.006us      39.006us             1  
+                                           aten::conv1d         0.71%       5.580us        13.99%     109.252us      36.417us       0.000us         0.00%      20.512us       6.837us             3  
+                                      aten::convolution         1.09%       8.531us        13.27%     103.672us      34.557us       0.000us         0.00%      20.512us       6.837us             3  
+                                     aten::_convolution         2.62%      20.459us        12.18%      95.141us      31.714us       0.000us         0.00%      20.512us       6.837us             3  
+                                aten::_conv_depthwise2d         2.59%      20.222us         7.70%      60.162us      20.054us      20.512us        56.38%      20.512us       6.837us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.512us        56.38%      20.512us       6.837us             3  
+                                               aten::to         0.75%       5.821us        67.81%     529.608us      88.268us       0.000us         0.00%      18.494us       3.082us             6  
+                                         aten::_to_copy         2.86%      22.338us        67.06%     523.787us      87.298us       0.000us         0.00%      18.494us       3.082us             6  
+                                            aten::copy_         6.02%      47.020us        60.45%     472.148us      78.691us      15.870us        43.62%      18.494us       3.082us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.447us        23.22%       8.447us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.423us        20.40%       7.423us       2.474us             3  
+                                Activity Buffer Request        30.80%     240.594us        30.80%     240.594us     240.594us       2.624us         7.21%       2.624us       2.624us             1  
+                                    aten::empty_strided         3.75%      29.301us         3.75%      29.301us       4.884us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.46%     206.633us        26.46%     206.633us      22.959us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.01%      15.720us         2.61%      20.410us       2.268us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.02%       7.981us         1.02%       7.981us       0.532us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.13%       8.841us         1.13%       8.841us       2.947us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.15%       9.000us         1.15%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.329us         0.84%       6.560us       2.187us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 827.381us
-Self CUDA time total: 36.096us
+Self CPU time total: 781.073us
+Self CUDA time total: 36.382us
 
 
 
@@ -4761,29 +4543,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.025us       883.88%     336.025us     336.025us             1  
-                                            torch_eager        14.70%     120.902us        99.36%     817.351us     817.351us       0.000us         0.00%      40.610us      40.610us             1  
-                                           aten::conv1d         0.71%       5.820us        14.44%     118.823us      39.608us       0.000us         0.00%      22.304us       7.435us             3  
-                                      aten::convolution         1.12%       9.190us        13.74%     113.003us      37.668us       0.000us         0.00%      22.304us       7.435us             3  
-                                     aten::_convolution         2.83%      23.270us        12.62%     103.813us      34.604us       0.000us         0.00%      22.304us       7.435us             3  
-                                aten::_conv_depthwise2d         2.83%      23.309us         7.79%      64.072us      21.357us      22.304us        58.67%      22.304us       7.435us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.304us        58.67%      22.304us       7.435us             3  
-                                               aten::to         0.73%       5.990us        66.75%     549.075us      91.513us       0.000us         0.00%      18.306us       3.051us             6  
-                                         aten::_to_copy         2.91%      23.953us        66.02%     543.085us      90.514us       0.000us         0.00%      18.306us       3.051us             6  
-                                            aten::copy_         6.07%      49.902us        59.57%     490.042us      81.674us      15.713us        41.33%      18.306us       3.051us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.353us        21.97%       8.353us       2.784us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        19.36%       7.360us       2.453us             3  
-                                Activity Buffer Request        30.85%     253.806us        30.85%     253.806us     253.806us       2.593us         6.82%       2.593us       2.593us             1  
-                                    aten::empty_strided         3.54%      29.090us         3.54%      29.090us       4.848us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.29%     208.074us        25.29%     208.074us      23.119us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.19%      18.051us         2.84%      23.371us       2.597us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.11%       9.160us         1.11%       9.160us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.21%       9.961us         1.21%       9.961us       3.320us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.10%       9.062us         1.10%       9.062us       3.021us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.80%       6.580us         0.96%       7.920us       2.640us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     353.311us       916.31%     353.311us     353.311us             1  
+                                            torch_eager        17.31%     144.171us        99.40%     827.943us     827.943us       0.000us         0.00%      41.150us      41.150us             1  
+                                           aten::conv1d         0.66%       5.470us        14.12%     117.601us      39.200us       0.000us         0.00%      22.624us       7.541us             3  
+                                      aten::convolution         1.09%       9.120us        13.46%     112.131us      37.377us       0.000us         0.00%      22.624us       7.541us             3  
+                                     aten::_convolution         2.77%      23.100us        12.37%     103.011us      34.337us       0.000us         0.00%      22.624us       7.541us             3  
+                                aten::_conv_depthwise2d         2.63%      21.901us         7.78%      64.791us      21.597us      22.624us        58.68%      22.624us       7.541us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.624us        58.68%      22.624us       7.541us             3  
+                                               aten::to         0.71%       5.920us        64.88%     540.450us      90.075us       0.000us         0.00%      18.526us       3.088us             6  
+                                         aten::_to_copy         2.59%      21.613us        64.17%     534.530us      89.088us       0.000us         0.00%      18.526us       3.088us             6  
+                                            aten::copy_         5.88%      48.990us        58.06%     483.646us      80.608us      15.934us        41.32%      18.526us       3.088us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.575us        22.24%       8.575us       2.858us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.09%       7.359us       2.453us             3  
+                                Activity Buffer Request        29.91%     249.164us        29.91%     249.164us     249.164us       2.592us         6.72%       2.592us       2.592us             1  
+                                    aten::empty_strided         3.51%      29.271us         3.51%      29.271us       4.879us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.18%     209.712us        25.18%     209.712us      23.301us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.99%      16.542us         2.59%      21.611us       2.401us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.04%       8.638us         1.04%       8.638us       0.576us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%       9.650us         1.16%       9.650us       3.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.08%       9.020us         1.08%       9.020us       3.007us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.681us         0.85%       7.060us       2.353us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 822.611us
-Self CUDA time total: 38.017us
+Self CPU time total: 832.973us
+Self CUDA time total: 38.558us
 
 
 
@@ -4793,29 +4575,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.486us       522.89%     335.486us     335.486us             1  
-                                            torch_eager        15.29%     123.163us        99.38%     800.491us     800.491us       0.000us         0.00%      68.256us      68.256us             1  
-                                           aten::conv1d         0.73%       5.840us        14.87%     119.763us      39.921us       0.000us         0.00%      41.760us      13.920us             3  
-                                      aten::convolution         1.21%       9.761us        14.14%     113.923us      37.974us       0.000us         0.00%      41.760us      13.920us             3  
-                                     aten::_convolution         2.84%      22.911us        12.93%     104.162us      34.721us       0.000us         0.00%      41.760us      13.920us             3  
-                                aten::_conv_depthwise2d         2.80%      22.570us         8.02%      64.572us      21.524us      41.760us        65.09%      41.760us      13.920us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.760us        65.09%      41.760us      13.920us             3  
-                                               aten::to         0.73%       5.842us        65.67%     528.904us      88.151us       0.000us         0.00%      26.496us       4.416us             6  
-                                         aten::_to_copy         2.94%      23.712us        64.94%     523.062us      87.177us       0.000us         0.00%      26.496us       4.416us             6  
-                                            aten::copy_         6.02%      48.492us        58.29%     469.521us      78.253us      22.400us        34.91%      26.496us       4.416us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.968us        18.65%      11.968us       3.989us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        16.26%      10.432us       3.477us             3  
-                                Activity Buffer Request        29.33%     236.206us        29.33%     236.206us     236.206us       4.096us         6.38%       4.096us       4.096us             1  
-                                    aten::empty_strided         3.70%      29.829us         3.70%      29.829us       4.971us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.91%     208.693us        25.91%     208.693us      23.188us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.18%      17.569us         2.86%      23.069us       2.563us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.14%       9.222us         1.14%       9.222us       0.615us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.20%       9.631us         1.20%       9.631us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.06%       8.501us         1.06%       8.501us       2.834us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.83%       6.660us         0.99%       7.990us       2.663us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.829us       488.45%     316.829us     316.829us             1  
+                                            torch_eager        14.19%     114.002us        99.33%     798.183us     798.183us       0.000us         0.00%      68.991us      68.991us             1  
+                                           aten::conv1d         0.68%       5.460us        13.80%     110.892us      36.964us       0.000us         0.00%      42.304us      14.101us             3  
+                                      aten::convolution         1.10%       8.859us        13.12%     105.432us      35.144us       0.000us         0.00%      42.304us      14.101us             3  
+                                     aten::_convolution         2.59%      20.821us        12.02%      96.573us      32.191us       0.000us         0.00%      42.304us      14.101us             3  
+                                aten::_conv_depthwise2d         2.64%      21.190us         7.50%      60.251us      20.084us      42.304us        65.22%      42.304us      14.101us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      42.304us        65.22%      42.304us      14.101us             3  
+                                               aten::to         0.75%       6.059us        68.35%     549.177us      91.530us       0.000us         0.00%      26.687us       4.448us             6  
+                                         aten::_to_copy         2.76%      22.169us        67.59%     543.118us      90.520us       0.000us         0.00%      26.687us       4.448us             6  
+                                            aten::copy_         6.74%      54.161us        61.27%     492.308us      82.051us      22.560us        34.78%      26.687us       4.448us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.095us        18.65%      12.095us       4.032us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.465us        16.13%      10.465us       3.488us             3  
+                                Activity Buffer Request        31.75%     255.134us        31.75%     255.134us     255.134us       4.127us         6.36%       4.127us       4.127us             1  
+                                    aten::empty_strided         3.56%      28.641us         3.56%      28.641us       4.773us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.49%     204.843us        25.49%     204.843us      22.760us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.06%      16.521us         2.65%      21.322us       2.369us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.02%       8.171us         1.02%       8.171us       0.545us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.14%       9.170us         1.14%       9.170us       3.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.00%       8.061us         1.00%       8.061us       2.687us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.66%       5.330us         0.81%       6.520us       2.173us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 805.451us
-Self CUDA time total: 64.160us
+Self CPU time total: 803.533us
+Self CUDA time total: 64.864us
 
 
 
@@ -4825,29 +4607,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.218us       487.48%     340.218us     340.218us             1  
-                                            torch_eager        15.18%     124.853us        99.38%     817.682us     817.682us       0.000us         0.00%      73.887us      73.887us             1  
-                                           aten::conv1d         0.72%       5.910us        14.57%     119.903us      39.968us       0.000us         0.00%      47.328us      15.776us             3  
-                                      aten::convolution         1.21%       9.960us        13.86%     113.993us      37.998us       0.000us         0.00%      47.328us      15.776us             3  
-                                     aten::_convolution         2.81%      23.101us        12.64%     104.033us      34.678us       0.000us         0.00%      47.328us      15.776us             3  
-                                aten::_conv_depthwise2d         2.62%      21.561us         7.83%      64.432us      21.477us      47.328us        67.81%      47.328us      15.776us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.328us        67.81%      47.328us      15.776us             3  
-                                               aten::to         0.75%       6.180us        66.30%     545.475us      90.913us       0.000us         0.00%      26.559us       4.426us             6  
-                                         aten::_to_copy         2.97%      24.459us        65.55%     539.295us      89.882us       0.000us         0.00%      26.559us       4.426us             6  
-                                            aten::copy_         6.14%      50.491us        58.93%     484.862us      80.810us      22.463us        32.19%      26.559us       4.426us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.032us        17.24%      12.032us       4.011us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        14.95%      10.431us       3.477us             3  
-                                Activity Buffer Request        30.21%     248.576us        30.21%     248.576us     248.576us       4.096us         5.87%       4.096us       4.096us             1  
-                                    aten::empty_strided         3.64%      29.974us         3.64%      29.974us       4.996us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.32%     208.345us        25.32%     208.345us      23.149us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.09%      17.201us         2.72%      22.401us       2.489us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.11%       9.120us         1.11%       9.120us       0.608us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.32%      10.899us         1.32%      10.899us       3.633us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       9.422us         1.15%       9.422us       3.141us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.80%       6.580us         0.98%       8.070us       2.690us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.383us       466.25%     328.383us     328.383us             1  
+                                            torch_eager         5.82%     138.672us        99.78%       2.376ms       2.376ms       0.000us         0.00%      74.527us      74.527us             1  
+                                           aten::conv1d         0.24%       5.689us         4.87%     115.970us      38.657us       0.000us         0.00%      47.969us      15.990us             3  
+                                      aten::convolution         0.43%      10.191us         4.63%     110.281us      36.760us       0.000us         0.00%      47.969us      15.990us             3  
+                                     aten::_convolution         0.91%      21.579us         4.20%     100.090us      33.363us       0.000us         0.00%      47.969us      15.990us             3  
+                                aten::_conv_depthwise2d         0.87%      20.670us         2.63%      62.670us      20.890us      47.969us        68.11%      47.969us      15.990us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.969us        68.11%      47.969us      15.990us             3  
+                                               aten::to         0.27%       6.430us        88.04%       2.097ms     349.464us       0.000us         0.00%      26.558us       4.426us             6  
+                                         aten::_to_copy         0.99%      23.642us        87.77%       2.090ms     348.392us       0.000us         0.00%      26.558us       4.426us             6  
+                                            aten::copy_         2.06%      49.120us        85.54%       2.037ms     339.525us      22.462us        31.89%      26.558us       4.426us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.999us        17.04%      11.999us       4.000us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.463us        14.86%      10.463us       3.488us             3  
+                                Activity Buffer Request        75.66%       1.802ms        75.66%       1.802ms       1.802ms       4.096us         5.82%       4.096us       4.096us             1  
+                                    aten::empty_strided         1.24%      29.560us         1.24%      29.560us       4.927us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.75%     208.373us         8.75%     208.373us      23.153us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.782us         0.92%      21.972us       2.441us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.520us         0.36%       8.520us       0.568us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.38%       9.160us         0.38%       9.160us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%      10.580us         0.44%      10.580us       3.527us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       5.730us         0.29%       7.020us       2.340us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 822.752us
-Self CUDA time total: 69.791us
+Self CPU time total: 2.382ms
+Self CUDA time total: 70.431us
 
 
 
@@ -4857,29 +4639,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.276us       192.10%     357.276us     357.276us             1  
-                                            torch_eager         7.25%     148.445us        99.75%       2.043ms       2.043ms       0.000us         0.00%     196.063us     196.063us             1  
-                                           aten::conv1d         0.28%       5.714us         6.04%     123.725us      41.242us       0.000us         0.00%     133.535us      44.512us             3  
-                                      aten::convolution         0.50%      10.209us         5.76%     118.011us      39.337us       0.000us         0.00%     133.535us      44.512us             3  
-                                     aten::_convolution         1.22%      24.922us         5.26%     107.802us      35.934us       0.000us         0.00%     133.535us      44.512us             3  
-                                aten::_conv_depthwise2d         1.06%      21.740us         3.25%      66.540us      22.180us     133.535us        71.80%     133.535us      44.512us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.535us        71.80%     133.535us      44.512us             3  
-                                               aten::to         0.32%       6.558us        85.01%       1.741ms     290.215us       0.000us         0.00%      62.528us      10.421us             6  
-                                         aten::_to_copy         1.28%      26.242us        84.69%       1.735ms     289.122us       0.000us         0.00%      62.528us      10.421us             6  
-                                            aten::copy_         2.37%      48.539us        81.91%       1.678ms     279.634us      52.448us        28.20%      62.528us      10.421us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.536us        15.88%      29.536us       9.845us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        12.32%      22.912us       7.637us             3  
-                                Activity Buffer Request        70.45%       1.443ms        70.45%       1.443ms       1.443ms      10.080us         5.42%      10.080us      10.080us             1  
-                                    aten::empty_strided         1.50%      30.691us         1.50%      30.691us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.22%     209.265us        10.22%     209.265us      23.252us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.93%      19.072us         1.20%      24.640us       2.738us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.247us         0.45%       9.247us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      11.270us         0.55%      11.270us       3.757us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%      10.520us         0.51%      10.520us       3.507us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.931us         0.35%       7.230us       2.410us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.351us       179.68%     336.351us     336.351us             1  
+                                            torch_eager         5.85%     142.571us        99.79%       2.430ms       2.430ms       0.000us         0.00%     197.311us     197.311us             1  
+                                           aten::conv1d         0.28%       6.741us         4.71%     114.731us      38.244us       0.000us         0.00%     134.368us      44.789us             3  
+                                      aten::convolution         0.38%       9.350us         4.43%     107.990us      35.997us       0.000us         0.00%     134.368us      44.789us             3  
+                                     aten::_convolution         0.88%      21.488us         4.05%      98.640us      32.880us       0.000us         0.00%     134.368us      44.789us             3  
+                                aten::_conv_depthwise2d         0.83%      20.301us         2.51%      61.091us      20.364us     134.368us        71.78%     134.368us      44.789us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     134.368us        71.78%     134.368us      44.789us             3  
+                                               aten::to         0.26%       6.379us        88.22%       2.148ms     358.072us       0.000us         0.00%      62.943us      10.491us             6  
+                                         aten::_to_copy         0.93%      22.632us        87.96%       2.142ms     357.009us       0.000us         0.00%      62.943us      10.491us             6  
+                                            aten::copy_         2.03%      49.489us        85.76%       2.089ms     348.110us      52.831us        28.22%      62.943us      10.491us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.727us        15.88%      29.727us       9.909us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.104us        12.34%      23.104us       7.701us             3  
+                                Activity Buffer Request        76.11%       1.853ms        76.11%       1.853ms       1.853ms      10.112us         5.40%      10.112us      10.112us             1  
+                                    aten::empty_strided         1.26%      30.760us         1.26%      30.760us       5.127us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.55%     208.274us         8.55%     208.274us      23.142us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.71%      17.184us         0.91%      22.223us       2.469us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.34%       8.338us         0.34%       8.338us       0.556us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.38%       9.180us         0.38%       9.180us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       9.020us         0.37%       9.020us       3.007us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.22%       5.460us         0.27%       6.690us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.048ms
-Self CUDA time total: 185.983us
+Self CPU time total: 2.435ms
+Self CUDA time total: 187.199us
 
 
 
@@ -4889,29 +4671,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.235us       170.21%     358.235us     358.235us             1  
-                                            torch_eager        15.50%     124.275us        99.34%     796.461us     796.461us       0.000us         0.00%     224.253us     224.253us             1  
-                                           aten::conv1d         0.70%       5.590us        14.78%     118.483us      39.494us       0.000us         0.00%     154.174us      51.391us             3  
-                                      aten::convolution         1.24%       9.921us        14.08%     112.893us      37.631us       0.000us         0.00%     154.174us      51.391us             3  
-                                     aten::_convolution         2.81%      22.549us        12.84%     102.972us      34.324us       0.000us         0.00%     154.174us      51.391us             3  
-                                aten::_conv_depthwise2d         2.82%      22.632us         8.11%      65.062us      21.687us     154.174us        73.26%     154.174us      51.391us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.174us        73.26%     154.174us      51.391us             3  
-                                               aten::to         0.74%       5.971us        65.46%     524.833us      87.472us       0.000us         0.00%      70.079us      11.680us             6  
-                                         aten::_to_copy         3.23%      25.880us        64.72%     518.862us      86.477us       0.000us         0.00%      70.079us      11.680us             6  
-                                            aten::copy_         6.33%      50.713us        57.67%     462.401us      77.067us      56.287us        26.74%      70.079us      11.680us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      33.248us        15.80%      33.248us      11.083us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.039us        10.95%      23.039us       7.680us             3  
-                                Activity Buffer Request        28.19%     225.995us        28.19%     225.995us     225.995us      13.792us         6.55%      13.792us      13.792us             1  
-                                    aten::empty_strided         3.81%      30.581us         3.81%      30.581us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.98%     208.263us        25.98%     208.263us      23.140us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.24%      17.992us         2.91%      23.301us       2.589us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.16%       9.309us         1.16%       9.309us       0.621us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.31%      10.480us         1.31%      10.480us       3.493us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%       9.380us         1.17%       9.380us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.74%       5.910us         0.92%       7.370us       2.457us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.323us       159.21%     335.323us     335.323us             1  
+                                            torch_eager        14.44%     115.471us        99.40%     794.842us     794.842us       0.000us         0.00%     223.709us     223.709us             1  
+                                           aten::conv1d         0.70%       5.561us        13.80%     110.362us      36.787us       0.000us         0.00%     154.845us      51.615us             3  
+                                      aten::convolution         1.15%       9.189us        13.11%     104.801us      34.934us       0.000us         0.00%     154.845us      51.615us             3  
+                                     aten::_convolution         2.52%      20.182us        11.96%      95.612us      31.871us       0.000us         0.00%     154.845us      51.615us             3  
+                                aten::_conv_depthwise2d         2.51%      20.101us         7.60%      60.741us      20.247us     154.845us        73.52%     154.845us      51.615us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.845us        73.52%     154.845us      51.615us             3  
+                                               aten::to         0.72%       5.750us        68.18%     545.179us      90.863us       0.000us         0.00%      68.864us      11.477us             6  
+                                         aten::_to_copy         2.77%      22.130us        67.46%     539.429us      89.905us       0.000us         0.00%      68.864us      11.477us             6  
+                                            aten::copy_         5.86%      46.830us        60.79%     486.078us      81.013us      55.776us        26.48%      68.864us      11.477us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.416us        15.39%      32.416us      10.805us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.360us        11.09%      23.360us       7.787us             3  
+                                Activity Buffer Request        31.66%     253.204us        31.66%     253.204us     253.204us      13.088us         6.21%      13.088us      13.088us             1  
+                                    aten::empty_strided         3.90%      31.221us         3.90%      31.221us       5.203us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.02%     208.054us        26.02%     208.054us      23.117us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.93%      15.399us         2.47%      19.760us       2.196us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.98%       7.800us         0.98%       7.800us       0.520us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%       9.810us         1.23%       9.810us       3.270us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.10%       8.820us         1.10%       8.820us       2.940us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.69%       5.519us         0.86%       6.899us       2.300us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 801.751us
-Self CUDA time total: 210.461us
+Self CPU time total: 799.662us
+Self CUDA time total: 210.621us
 
 
 
@@ -4921,29 +4703,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         7.15%     131.473us        52.77%     970.085us     970.085us       0.000us         0.00%       1.521ms       1.521ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.421ms       100.40%       1.421ms       1.421ms             1  
-                                               aten::to         0.36%       6.571us        37.17%     683.219us     113.870us       0.000us         0.00%     824.180us     137.363us             6  
-                                         aten::_to_copy         1.61%      29.612us        36.81%     676.648us     112.775us       0.000us         0.00%     824.180us     137.363us             6  
-                                            aten::copy_         2.81%      51.569us        25.14%     462.051us      77.009us     718.613us        50.76%     824.180us     137.363us             6  
-                                           aten::conv1d         0.36%       6.680us         6.82%     125.423us      41.808us       0.000us         0.00%     696.981us     232.327us             3  
-                                      aten::convolution         0.57%      10.460us         6.46%     118.743us      39.581us       0.000us         0.00%     696.981us     232.327us             3  
-                                     aten::_convolution         1.31%      24.040us         5.89%     108.283us      36.094us       0.000us         0.00%     696.981us     232.327us             3  
-                                aten::_conv_depthwise2d         1.25%      22.981us         3.69%      67.913us      22.638us     696.981us        49.24%     696.981us     232.327us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.981us        49.24%     696.981us     232.327us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     410.458us        29.00%     410.458us     136.819us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     308.155us        21.77%     308.155us     102.718us             3  
-                                Activity Buffer Request        11.91%     218.936us        11.91%     218.936us     218.936us     105.567us         7.46%     105.567us     105.567us             1  
-                                    aten::empty_strided         2.01%      37.011us        10.06%     184.985us      30.831us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.74%     215.777us        11.74%     215.777us      23.975us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.99%      18.200us         1.31%      24.000us       2.667us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.53%       9.740us         0.53%       9.740us       0.649us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.59%      10.839us         0.59%      10.839us       3.613us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.54%       9.862us         0.54%       9.862us       3.287us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.240us         0.42%       7.700us       2.567us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.62%     120.362us        52.56%     956.135us     956.135us       0.000us         0.00%       1.509ms       1.509ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.411ms       100.41%       1.411ms       1.411ms             1  
+                                               aten::to         0.34%       6.140us        38.13%     693.750us     115.625us       0.000us         0.00%     815.515us     135.919us             6  
+                                         aten::_to_copy         1.53%      27.810us        37.80%     687.610us     114.602us       0.000us         0.00%     815.515us     135.919us             6  
+                                            aten::copy_         2.83%      51.570us        25.68%     467.247us      77.874us     711.740us        50.66%     815.515us     135.919us             6  
+                                           aten::conv1d         0.32%       5.781us         6.36%     115.702us      38.567us       0.000us         0.00%     693.278us     231.093us             3  
+                                      aten::convolution         0.51%       9.289us         6.04%     109.921us      36.640us       0.000us         0.00%     693.278us     231.093us             3  
+                                     aten::_convolution         1.19%      21.630us         5.53%     100.632us      33.544us       0.000us         0.00%     693.278us     231.093us             3  
+                                aten::_conv_depthwise2d         1.16%      21.108us         3.52%      63.951us      21.317us     693.278us        49.34%     693.278us     231.093us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     693.278us        49.34%     693.278us     231.093us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     405.439us        28.86%     405.439us     135.146us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     306.301us        21.80%     306.301us     102.100us             3  
+                                Activity Buffer Request        12.14%     220.924us        12.14%     220.924us     220.924us     103.775us         7.39%     103.775us     103.775us             1  
+                                    aten::empty_strided         1.98%      36.051us        10.58%     192.553us      32.092us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.05%     219.204us        12.05%     219.204us      24.356us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.93%      16.940us         1.22%      22.200us       2.467us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%       8.651us         0.48%       8.651us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%       9.201us         0.51%       9.201us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%       9.191us         0.51%       9.191us       3.064us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.621us         0.38%       6.871us       2.290us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.838ms
-Self CUDA time total: 1.416ms
+Self CPU time total: 1.819ms
+Self CUDA time total: 1.405ms
 
 
 
@@ -4953,56 +4735,56 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.74%     124.615us        43.66%     806.720us     806.720us       0.000us         0.00%       1.502ms       1.502ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.433ms       100.41%       1.433ms       1.433ms             1  
-                                               aten::to         0.34%       6.269us        28.35%     523.751us      87.292us       0.000us         0.00%     764.786us     127.464us             6  
-                                         aten::_to_copy         1.27%      23.480us        28.01%     517.482us      86.247us       0.000us         0.00%     764.786us     127.464us             6  
-                                            aten::copy_         2.74%      50.661us        25.15%     464.712us      77.452us     690.099us        48.36%     764.786us     127.464us             6  
-                                           aten::conv1d         0.32%       5.870us         7.00%     129.374us      43.125us       0.000us         0.00%     737.040us     245.680us             3  
-                                      aten::convolution         0.54%       9.999us         6.68%     123.504us      41.168us       0.000us         0.00%     737.040us     245.680us             3  
-                                     aten::_convolution         1.31%      24.293us         6.14%     113.505us      37.835us       0.000us         0.00%     737.040us     245.680us             3  
-                                aten::_conv_depthwise2d         1.62%      30.010us         3.95%      73.060us      24.353us     737.040us        51.64%     737.040us     245.680us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     737.040us        51.64%     737.040us     245.680us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     399.673us        28.01%     399.673us     133.224us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     290.426us        20.35%     290.426us      96.809us             3  
-                                Activity Buffer Request        12.15%     224.466us        12.15%     224.466us     224.466us      74.687us         5.23%      74.687us      74.687us             1  
-                                    aten::empty_strided         1.59%      29.290us         1.59%      29.290us       4.882us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.52%     212.785us        11.52%     212.785us      23.643us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.94%      17.281us         1.23%      22.771us       2.530us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.55%      10.081us         0.55%      10.081us       0.672us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.57%      10.440us         0.57%      10.440us       3.480us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%       9.410us         0.51%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.33%       6.150us         0.41%       7.641us       2.547us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.07%     112.213us        42.26%     781.792us     781.792us       0.000us         0.00%       1.498ms       1.498ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.428ms       100.39%       1.428ms       1.428ms             1  
+                                               aten::to         0.33%       6.130us        28.74%     531.749us      88.625us       0.000us         0.00%     757.569us     126.261us             6  
+                                         aten::_to_copy         1.23%      22.780us        28.41%     525.619us      87.603us       0.000us         0.00%     757.569us     126.261us             6  
+                                            aten::copy_         2.64%      48.852us        25.56%     472.969us      78.828us     682.049us        47.95%     757.569us     126.261us             6  
+                                           aten::conv1d         0.33%       6.130us         6.13%     113.361us      37.787us       0.000us         0.00%     740.449us     246.816us             3  
+                                      aten::convolution         0.48%       8.889us         5.80%     107.231us      35.744us       0.000us         0.00%     740.449us     246.816us             3  
+                                     aten::_convolution         1.13%      20.931us         5.32%      98.342us      32.781us       0.000us         0.00%     740.449us     246.816us             3  
+                                aten::_conv_depthwise2d         1.15%      21.330us         3.38%      62.491us      20.830us     740.449us        52.05%     740.449us     246.816us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     740.449us        52.05%     740.449us     246.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.857us        27.97%     397.857us     132.619us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     284.192us        19.98%     284.192us      94.731us             3  
+                                Activity Buffer Request        12.95%     239.644us        12.95%     239.644us     239.644us      75.520us         5.31%      75.520us      75.520us             1  
+                                    aten::empty_strided         1.61%      29.870us         1.61%      29.870us       4.978us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.17%     206.574us        11.17%     206.574us      22.953us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.85%      15.779us         1.12%      20.809us       2.312us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       8.409us         0.45%       8.409us       0.561us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.49%       9.120us         0.49%       9.120us       3.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.54%       9.940us         0.54%       9.940us       3.313us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.381us         0.36%       6.700us       2.233us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.848ms
-Self CUDA time total: 1.427ms
+Self CPU time total: 1.850ms
+Self CUDA time total: 1.422ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B2_D2048_S128_W2     0.09  True
+torch_eager              cuda_B2_D2048_S128_W2     0.08  True
 torch_eager              cuda_B2_D2048_S128_W4     0.08  True
-torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
+torch_eager              cuda_B2_D2048_S2048_W2     0.16  True
 torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
-torch_eager              cuda_B2_D2048_S512_W2     0.09  True
-torch_eager              cuda_B2_D2048_S512_W4     0.09  True
+torch_eager              cuda_B2_D2048_S512_W2     0.08  True
+torch_eager              cuda_B2_D2048_S512_W4     0.08  True
 torch_eager              cuda_B2_D64_S128_W2     0.07  True
-torch_eager              cuda_B2_D64_S128_W4     0.09  True
-torch_eager              cuda_B2_D64_S2048_W2     0.09  True
-torch_eager              cuda_B2_D64_S2048_W4     0.09  True
-torch_eager              cuda_B2_D64_S512_W2     0.09  True
-torch_eager              cuda_B2_D64_S512_W4     0.09  True
-torch_eager              cuda_B4_D2048_S128_W2     0.09  True
-torch_eager              cuda_B4_D2048_S128_W4     0.09  True
-torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
+torch_eager              cuda_B2_D64_S128_W4     0.08  True
+torch_eager              cuda_B2_D64_S2048_W2     0.08  True
+torch_eager              cuda_B2_D64_S2048_W4     0.08  True
+torch_eager              cuda_B2_D64_S512_W2     0.08  True
+torch_eager              cuda_B2_D64_S512_W4     0.08  True
+torch_eager              cuda_B4_D2048_S128_W2     0.08  True
+torch_eager              cuda_B4_D2048_S128_W4     0.08  True
+torch_eager              cuda_B4_D2048_S2048_W2     0.48  True
 torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
-torch_eager              cuda_B4_D2048_S512_W2     0.10  True
+torch_eager              cuda_B4_D2048_S512_W2     0.09  True
 torch_eager              cuda_B4_D2048_S512_W4     0.10  True
-torch_eager              cuda_B4_D64_S128_W2     0.09  True
+torch_eager              cuda_B4_D64_S128_W2     0.08  True
 torch_eager              cuda_B4_D64_S128_W4     0.08  True
-torch_eager              cuda_B4_D64_S2048_W2     0.09  True
-torch_eager              cuda_B4_D64_S2048_W4     0.09  True
-torch_eager              cuda_B4_D64_S512_W2     0.09  True
-torch_eager              cuda_B4_D64_S512_W4     0.09  True
+torch_eager              cuda_B4_D64_S2048_W2     0.08  True
+torch_eager              cuda_B4_D64_S2048_W4     0.08  True
+torch_eager              cuda_B4_D64_S512_W2     0.08  True
+torch_eager              cuda_B4_D64_S512_W4     0.08  True
 

Artifacts: