diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Fri Oct 31 20:00:25 2025 +Mon Nov 10 21:57:49 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 33C P0 79W / 350W | 0MiB / 46068MiB | 11% Default | +| N/A 27C P0 77W / 350W | 0MiB / 46068MiB | 18% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -4155,7 +3937,7 @@ Cell: nv | 0.21s ▼ output ▶ uv-logs | -Cell: benchmark | 3.68s +Cell: benchmark | 3.89s | Raw @@ -4217,29 +3999,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 439.324us 2269.12% 439.324us 439.324us 1 - torch_eager 10.31% 220.478us 99.69% 2.131ms 2.131ms 0.000us 0.00% 21.729us 21.729us 1 - aten::to 0.50% 10.770us 79.87% 1.707ms 284.530us 0.000us 0.00% 14.369us 2.395us 6 - aten::_to_copy 1.71% 36.499us 79.36% 1.696ms 282.735us 0.000us 0.00% 14.369us 2.395us 6 - aten::copy_ 2.77% 59.234us 75.21% 1.608ms 267.930us 12.001us 61.99% 14.369us 2.395us 6 - aten::conv1d 0.36% 7.590us 7.34% 156.883us 52.294us 0.000us 0.00% 7.360us 2.453us 3 - aten::convolution 0.66% 14.070us 6.98% 149.293us 49.764us 0.000us 0.00% 7.360us 2.453us 3 - aten::_convolution 1.51% 32.210us 6.33% 135.223us 45.074us 0.000us 0.00% 7.360us 2.453us 3 - aten::_conv_depthwise2d 1.61% 34.371us 4.00% 85.463us 28.488us 7.360us 38.01% 7.360us 2.453us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 38.01% 7.360us 2.453us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.337us 32.73% 6.337us 2.112us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.25% 5.664us 1.888us 3 - Activity Buffer Request 69.37% 1.483ms 69.37% 1.483ms 1.483ms 2.368us 12.23% 2.368us 2.368us 1 - aten::empty_strided 2.45% 52.331us 2.45% 52.331us 8.722us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 4.26% 91.032us 4.26% 91.032us 10.115us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.32% 28.311us 1.71% 36.491us 4.055us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.64% 13.700us 0.64% 13.700us 0.913us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.60% 12.790us 0.60% 12.790us 4.263us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.59% 12.710us 0.59% 12.710us 4.237us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.31% 6.640us 0.38% 8.090us 2.697us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 411.136us 2127.15% 411.136us 411.136us 1 + torch_eager 8.60% 205.173us 99.40% 2.372ms 2.372ms 0.000us 0.00% 21.632us 21.632us 1 + aten::to 0.40% 9.649us 83.06% 1.982ms 330.358us 0.000us 0.00% 14.272us 2.379us 6 + aten::_to_copy 1.47% 35.141us 82.65% 1.973ms 328.750us 0.000us 0.00% 14.272us 2.379us 6 + aten::copy_ 2.42% 57.830us 79.13% 1.889ms 314.753us 11.968us 61.92% 14.272us 2.379us 6 + aten::conv1d 0.32% 7.640us 6.22% 148.384us 49.461us 0.000us 0.00% 7.360us 2.453us 3 + aten::convolution 0.55% 13.222us 5.90% 140.744us 46.915us 0.000us 0.00% 7.360us 2.453us 3 + aten::_convolution 1.23% 29.427us 5.34% 127.522us 42.507us 0.000us 0.00% 7.360us 2.453us 3 + aten::_conv_depthwise2d 1.41% 33.690us 3.44% 82.073us 27.358us 7.360us 38.08% 7.360us 2.453us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 38.08% 7.360us 2.453us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.62% 6.304us 2.101us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.30% 5.664us 1.888us 3 + Activity Buffer Request 73.85% 1.762ms 73.85% 1.762ms 1.762ms 2.304us 11.92% 2.304us 2.304us 1 + aten::empty_strided 2.05% 48.841us 2.05% 48.841us 8.140us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.88% 92.484us 3.88% 92.484us 10.276us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.94% 22.551us 1.23% 29.352us 3.261us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.46% 10.991us 0.46% 10.991us 0.733us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.53% 12.660us 0.53% 12.660us 4.220us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.49% 11.631us 0.49% 11.631us 3.877us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.27% 6.340us 0.32% 7.570us 2.523us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.138ms -Self CUDA time total: 19.361us +Self CPU time total: 2.386ms +Self CUDA time total: 19.328us @@ -4249,29 +4031,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.789us 1742.49% 341.789us 341.789us 1 - torch_eager 7.86% 151.082us 99.71% 1.916ms 1.916ms 0.000us 0.00% 21.695us 21.695us 1 - aten::to 0.35% 6.661us 83.96% 1.614ms 268.966us 0.000us 0.00% 13.695us 2.282us 6 - aten::_to_copy 1.29% 24.781us 83.61% 1.607ms 267.856us 0.000us 0.00% 13.695us 2.282us 6 - aten::copy_ 2.59% 49.784us 80.72% 1.552ms 258.589us 11.615us 59.21% 13.695us 2.282us 6 - aten::conv1d 0.32% 6.220us 6.35% 122.113us 40.704us 0.000us 0.00% 8.000us 2.667us 3 - aten::convolution 0.53% 10.120us 6.03% 115.893us 38.631us 0.000us 0.00% 8.000us 2.667us 3 - aten::_convolution 1.20% 23.080us 5.50% 105.773us 35.258us 0.000us 0.00% 8.000us 2.667us 3 - aten::_conv_depthwise2d 1.19% 22.952us 3.39% 65.123us 21.708us 8.000us 40.79% 8.000us 2.667us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 40.79% 8.000us 2.667us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 30.83% 6.047us 2.016us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.39% 5.568us 1.856us 3 - Activity Buffer Request 75.54% 1.452ms 75.54% 1.452ms 1.452ms 2.080us 10.60% 2.080us 2.080us 1 - aten::empty_strided 1.60% 30.820us 1.60% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.74% 71.953us 3.74% 71.953us 7.995us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.98% 18.881us 1.29% 24.750us 2.750us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.50% 9.609us 0.50% 9.609us 0.641us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.56% 10.750us 0.56% 10.750us 3.583us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.49% 9.339us 0.49% 9.339us 3.113us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.34% 6.630us 0.42% 8.000us 2.667us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 320.094us 1629.14% 320.094us 320.094us 1 + torch_eager 6.61% 147.267us 99.75% 2.222ms 2.222ms 0.000us 0.00% 21.856us 21.856us 1 + aten::to 0.28% 6.328us 86.86% 1.935ms 322.525us 0.000us 0.00% 13.888us 2.315us 6 + aten::_to_copy 0.99% 22.058us 86.58% 1.929ms 321.470us 0.000us 0.00% 13.888us 2.315us 6 + aten::copy_ 2.09% 46.581us 84.13% 1.874ms 312.384us 11.680us 59.45% 13.888us 2.315us 6 + aten::conv1d 0.26% 5.880us 5.20% 115.901us 38.634us 0.000us 0.00% 7.968us 2.656us 3 + aten::convolution 0.41% 9.201us 4.94% 110.021us 36.674us 0.000us 0.00% 7.968us 2.656us 3 + aten::_convolution 0.99% 22.029us 4.53% 100.820us 33.607us 0.000us 0.00% 7.968us 2.656us 3 + aten::_conv_depthwise2d 0.98% 21.809us 2.84% 63.210us 21.070us 7.968us 40.55% 7.968us 2.656us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 40.55% 7.968us 2.656us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.112us 31.11% 6.112us 2.037us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.34% 5.568us 1.856us 3 + Activity Buffer Request 79.89% 1.780ms 79.89% 1.780ms 1.780ms 2.208us 11.24% 2.208us 2.208us 1 + aten::empty_strided 1.46% 32.461us 1.46% 32.461us 5.410us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.22% 71.802us 3.22% 71.802us 7.978us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.71% 15.809us 0.93% 20.750us 2.306us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.38% 8.492us 0.38% 8.492us 0.566us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.41% 9.081us 0.41% 9.081us 3.027us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.38% 8.530us 0.38% 8.530us 2.843us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.26% 5.730us 0.32% 7.140us 2.380us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.922ms -Self CUDA time total: 19.615us +Self CPU time total: 2.228ms +Self CUDA time total: 19.648us @@ -4281,29 +4063,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 343.328us 1837.45% 343.328us 343.328us 1 - torch_eager 7.88% 151.015us 99.69% 1.911ms 1.911ms 0.000us 0.00% 20.605us 20.605us 1 - aten::to 0.33% 6.409us 84.02% 1.611ms 268.468us 0.000us 0.00% 13.662us 2.277us 6 - aten::_to_copy 1.32% 25.354us 83.68% 1.604ms 267.400us 0.000us 0.00% 13.662us 2.277us 6 - aten::copy_ 2.65% 50.770us 80.80% 1.549ms 258.170us 11.742us 62.84% 13.662us 2.277us 6 - aten::conv1d 0.33% 6.290us 6.34% 121.483us 40.494us 0.000us 0.00% 6.943us 2.314us 3 - aten::convolution 0.54% 10.430us 6.01% 115.193us 38.398us 0.000us 0.00% 6.943us 2.314us 3 - aten::_convolution 1.17% 22.439us 5.46% 104.763us 34.921us 0.000us 0.00% 6.943us 2.314us 3 - aten::_conv_depthwise2d 1.17% 22.412us 3.43% 65.843us 21.948us 6.943us 37.16% 6.943us 2.314us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.943us 37.16% 6.943us 2.314us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.982us 32.01% 5.982us 1.994us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 30.83% 5.760us 1.920us 3 - Activity Buffer Request 75.50% 1.448ms 75.50% 1.448ms 1.448ms 1.920us 10.28% 1.920us 1.920us 1 - aten::empty_strided 1.57% 30.029us 1.57% 30.029us 5.005us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.90% 74.680us 3.90% 74.680us 8.298us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.93% 17.782us 1.21% 23.252us 2.584us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.48% 9.281us 0.48% 9.281us 0.619us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.57% 10.910us 0.57% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.44% 8.531us 0.44% 8.531us 2.844us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.32% 6.170us 0.39% 7.570us 2.523us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 322.750us 1724.09% 322.750us 322.750us 1 + torch_eager 6.97% 154.353us 99.74% 2.208ms 2.208ms 0.000us 0.00% 20.736us 20.736us 1 + aten::to 0.30% 6.580us 86.44% 1.913ms 318.849us 0.000us 0.00% 13.791us 2.299us 6 + aten::_to_copy 1.09% 24.161us 86.14% 1.907ms 317.752us 0.000us 0.00% 13.791us 2.299us 6 + aten::copy_ 2.12% 46.909us 83.64% 1.851ms 308.533us 11.775us 62.90% 13.791us 2.299us 6 + aten::conv1d 0.30% 6.591us 5.18% 114.662us 38.221us 0.000us 0.00% 6.945us 2.315us 3 + aten::convolution 0.40% 8.811us 4.88% 108.071us 36.024us 0.000us 0.00% 6.945us 2.315us 3 + aten::_convolution 0.96% 21.188us 4.48% 99.260us 33.087us 0.000us 0.00% 6.945us 2.315us 3 + aten::_conv_depthwise2d 0.97% 21.520us 2.82% 62.461us 20.820us 6.945us 37.10% 6.945us 2.315us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.945us 37.10% 6.945us 2.315us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 32.30% 6.047us 2.016us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 30.60% 5.728us 1.909us 3 + Activity Buffer Request 79.41% 1.758ms 79.41% 1.758ms 1.758ms 2.016us 10.77% 2.016us 2.016us 1 + aten::empty_strided 1.41% 31.151us 1.41% 31.151us 5.192us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.17% 70.153us 3.17% 70.153us 7.795us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.77% 17.060us 1.01% 22.310us 2.479us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.39% 8.641us 0.39% 8.641us 0.576us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.42% 9.380us 0.42% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.37% 8.090us 0.37% 8.090us 2.697us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.25% 5.450us 0.31% 6.801us 2.267us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.917ms -Self CUDA time total: 18.685us +Self CPU time total: 2.213ms +Self CUDA time total: 18.720us @@ -4313,29 +4095,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.280us 1734.88% 340.280us 340.280us 1 - torch_eager 6.89% 141.563us 99.72% 2.049ms 2.049ms 0.000us 0.00% 21.726us 21.726us 1 - aten::to 0.30% 6.132us 85.38% 1.755ms 292.424us 0.000us 0.00% 13.982us 2.330us 6 - aten::_to_copy 1.19% 24.439us 85.08% 1.748ms 291.402us 0.000us 0.00% 13.982us 2.330us 6 - aten::copy_ 2.50% 51.302us 82.39% 1.693ms 282.182us 11.870us 60.52% 13.982us 2.330us 6 - aten::conv1d 0.29% 5.930us 5.97% 122.723us 40.908us 0.000us 0.00% 7.744us 2.581us 3 - aten::convolution 0.50% 10.300us 5.68% 116.793us 38.931us 0.000us 0.00% 7.744us 2.581us 3 - aten::_convolution 1.17% 23.960us 5.18% 106.493us 35.498us 0.000us 0.00% 7.744us 2.581us 3 - aten::_conv_depthwise2d 1.08% 22.141us 3.19% 65.452us 21.817us 7.744us 39.48% 7.744us 2.581us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 39.48% 7.744us 2.581us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.143us 31.32% 6.143us 2.048us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.727us 29.20% 5.727us 1.909us 3 - Activity Buffer Request 70.00% 1.438ms 70.00% 1.438ms 1.438ms 2.112us 10.77% 2.112us 2.112us 1 - aten::empty_strided 1.50% 30.881us 1.50% 30.881us 5.147us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 11.01% 226.194us 11.01% 226.194us 25.133us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.89% 18.302us 1.19% 24.432us 2.715us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.49% 9.981us 0.49% 9.981us 0.665us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.55% 11.260us 0.55% 11.260us 3.753us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.45% 9.171us 0.45% 9.171us 3.057us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.32% 6.620us 0.39% 8.030us 2.677us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.254us 1673.31% 328.254us 328.254us 1 + torch_eager 6.02% 146.742us 99.79% 2.431ms 2.431ms 0.000us 0.00% 21.729us 21.729us 1 + aten::to 0.25% 6.201us 87.89% 2.141ms 356.794us 0.000us 0.00% 14.048us 2.341us 6 + aten::_to_copy 0.95% 23.051us 87.64% 2.135ms 355.761us 0.000us 0.00% 14.048us 2.341us 6 + aten::copy_ 1.93% 46.899us 85.39% 2.080ms 346.662us 11.936us 60.85% 14.048us 2.341us 6 + aten::conv1d 0.28% 6.941us 4.83% 117.552us 39.184us 0.000us 0.00% 7.681us 2.560us 3 + aten::convolution 0.38% 9.320us 4.54% 110.611us 36.870us 0.000us 0.00% 7.681us 2.560us 3 + aten::_convolution 0.86% 20.861us 4.16% 101.291us 33.764us 0.000us 0.00% 7.681us 2.560us 3 + aten::_conv_depthwise2d 0.93% 22.752us 2.67% 64.991us 21.664us 7.681us 39.15% 7.681us 2.560us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.681us 39.15% 7.681us 2.560us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 31.65% 6.208us 2.069us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.20% 5.728us 1.909us 3 + Activity Buffer Request 75.50% 1.839ms 75.50% 1.839ms 1.839ms 2.112us 10.77% 2.112us 2.112us 1 + aten::empty_strided 1.29% 31.540us 1.29% 31.540us 5.257us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.87% 216.103us 8.87% 216.103us 24.011us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.70% 16.989us 0.90% 21.970us 2.441us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.35% 8.601us 0.35% 8.601us 0.573us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.43% 10.359us 0.43% 10.359us 3.453us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.40% 9.840us 0.40% 9.840us 3.280us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.22% 5.410us 0.28% 6.920us 2.307us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.055ms -Self CUDA time total: 19.614us +Self CPU time total: 2.436ms +Self CUDA time total: 19.617us @@ -4345,29 +4127,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 379.964us 1548.03% 379.964us 379.964us 1 - torch_eager 7.69% 160.944us 99.76% 2.089ms 2.089ms 0.000us 0.00% 26.817us 26.817us 1 - aten::to 0.33% 7.000us 83.76% 1.754ms 292.349us 0.000us 0.00% 15.265us 2.544us 6 - aten::_to_copy 1.23% 25.779us 83.43% 1.747ms 291.183us 0.000us 0.00% 15.265us 2.544us 6 - aten::copy_ 2.49% 52.100us 80.65% 1.689ms 281.484us 12.993us 52.94% 15.265us 2.544us 6 - aten::conv1d 0.31% 6.410us 6.85% 143.364us 47.788us 0.000us 0.00% 11.552us 3.851us 3 - aten::convolution 1.48% 31.021us 6.54% 136.954us 45.651us 0.000us 0.00% 11.552us 3.851us 3 - aten::_convolution 1.13% 23.621us 5.06% 105.933us 35.311us 0.000us 0.00% 11.552us 3.851us 3 - aten::_conv_depthwise2d 1.06% 22.209us 3.13% 65.632us 21.877us 11.552us 47.06% 11.552us 3.851us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.06% 11.552us 3.851us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.625us 26.99% 6.625us 2.208us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 25.94% 6.368us 2.123us 3 - Activity Buffer Request 68.76% 1.440ms 68.76% 1.440ms 1.440ms 2.272us 9.26% 2.272us 2.272us 1 - aten::empty_strided 1.55% 32.413us 1.55% 32.413us 5.402us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.50% 219.817us 10.50% 219.817us 24.424us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.87% 18.301us 1.15% 24.061us 2.673us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.50% 10.530us 0.50% 10.530us 0.702us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.50% 10.490us 0.50% 10.490us 3.497us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 9.872us 0.47% 9.872us 3.291us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.30% 6.220us 0.37% 7.740us 2.580us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.374us 1318.69% 325.374us 325.374us 1 + torch_eager 6.23% 145.210us 99.78% 2.326ms 2.326ms 0.000us 0.00% 26.978us 26.978us 1 + aten::to 0.28% 6.471us 87.58% 2.041ms 340.232us 0.000us 0.00% 15.298us 2.550us 6 + aten::_to_copy 1.01% 23.559us 87.30% 2.035ms 339.154us 0.000us 0.00% 15.298us 2.550us 6 + aten::copy_ 2.04% 47.563us 85.03% 1.982ms 330.320us 12.994us 52.66% 15.298us 2.550us 6 + aten::conv1d 0.26% 6.060us 4.91% 114.341us 38.114us 0.000us 0.00% 11.680us 3.893us 3 + aten::convolution 0.40% 9.250us 4.65% 108.281us 36.094us 0.000us 0.00% 11.680us 3.893us 3 + aten::_convolution 0.89% 20.669us 4.25% 99.031us 33.010us 0.000us 0.00% 11.680us 3.893us 3 + aten::_conv_depthwise2d 0.95% 22.039us 2.73% 63.550us 21.183us 11.680us 47.34% 11.680us 3.893us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.680us 47.34% 11.680us 3.893us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.657us 26.98% 6.657us 2.219us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.337us 25.68% 6.337us 2.112us 3 + Activity Buffer Request 74.59% 1.739ms 74.59% 1.739ms 1.739ms 2.304us 9.34% 2.304us 2.304us 1 + aten::empty_strided 1.26% 29.442us 1.26% 29.442us 4.907us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.39% 218.802us 9.39% 218.802us 24.311us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.69% 16.041us 0.91% 21.173us 2.353us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.37% 8.602us 0.37% 8.602us 0.573us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.40% 9.341us 0.40% 9.341us 3.114us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.39% 8.990us 0.39% 8.990us 2.997us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.290us 0.28% 6.580us 2.193us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.094ms -Self CUDA time total: 24.545us +Self CPU time total: 2.331ms +Self CUDA time total: 24.674us @@ -4377,29 +4159,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 351.133us 1341.43% 351.133us 351.133us 1 - torch_eager 7.55% 157.812us 99.73% 2.084ms 2.084ms 0.000us 0.00% 28.416us 28.416us 1 - aten::to 0.31% 6.571us 84.80% 1.772ms 295.318us 0.000us 0.00% 15.264us 2.544us 6 - aten::_to_copy 1.22% 25.450us 84.49% 1.765ms 294.223us 0.000us 0.00% 15.264us 2.544us 6 - aten::copy_ 2.31% 48.301us 81.82% 1.710ms 284.947us 13.024us 49.76% 15.264us 2.544us 6 - aten::conv1d 0.32% 6.640us 5.96% 124.543us 41.514us 0.000us 0.00% 13.152us 4.384us 3 - aten::convolution 0.50% 10.360us 5.64% 117.903us 39.301us 0.000us 0.00% 13.152us 4.384us 3 - aten::_convolution 1.16% 24.330us 5.15% 107.543us 35.848us 0.000us 0.00% 13.152us 4.384us 3 - aten::_conv_depthwise2d 1.06% 22.241us 3.14% 65.623us 21.874us 13.152us 50.24% 13.152us 4.384us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.152us 50.24% 13.152us 4.384us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 25.43% 6.656us 2.219us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.33% 6.368us 2.123us 3 - Activity Buffer Request 70.10% 1.465ms 70.10% 1.465ms 1.465ms 2.240us 8.56% 2.240us 2.240us 1 - aten::empty_strided 1.45% 30.202us 1.45% 30.202us 5.034us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.51% 219.677us 10.51% 219.677us 24.409us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.90% 18.881us 1.17% 24.421us 2.713us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.46% 9.580us 0.46% 9.580us 0.639us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.55% 11.471us 0.55% 11.471us 3.824us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.43% 8.890us 0.43% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.33% 6.950us 0.40% 8.400us 2.800us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.853us 1241.91% 325.853us 325.853us 1 + torch_eager 6.02% 142.382us 99.78% 2.359ms 2.359ms 0.000us 0.00% 28.510us 28.510us 1 + aten::to 0.27% 6.279us 87.80% 2.076ms 345.959us 0.000us 0.00% 15.262us 2.544us 6 + aten::_to_copy 0.97% 22.980us 87.54% 2.069ms 344.912us 0.000us 0.00% 15.262us 2.544us 6 + aten::copy_ 2.02% 47.672us 85.33% 2.017ms 336.189us 12.990us 49.51% 15.262us 2.544us 6 + aten::conv1d 0.27% 6.391us 4.88% 115.262us 38.421us 0.000us 0.00% 13.248us 4.416us 3 + aten::convolution 0.41% 9.629us 4.61% 108.871us 36.290us 0.000us 0.00% 13.248us 4.416us 3 + aten::_convolution 0.88% 20.800us 4.20% 99.242us 33.081us 0.000us 0.00% 13.248us 4.416us 3 + aten::_conv_depthwise2d 0.93% 21.882us 2.62% 62.041us 20.680us 13.248us 50.49% 13.248us 4.416us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.248us 50.49% 13.248us 4.416us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.622us 25.24% 6.622us 2.207us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.27% 6.368us 2.123us 3 + Activity Buffer Request 75.21% 1.778ms 75.21% 1.778ms 1.778ms 2.272us 8.66% 2.272us 2.272us 1 + aten::empty_strided 1.24% 29.361us 1.24% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.97% 212.032us 8.97% 212.032us 23.559us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.75% 17.821us 0.98% 23.130us 2.570us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.37% 8.699us 0.37% 8.699us 0.580us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.38% 9.090us 0.38% 9.090us 3.030us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 10.480us 0.44% 10.480us 3.493us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.24% 5.631us 0.30% 7.011us 2.337us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.089ms -Self CUDA time total: 26.176us +Self CPU time total: 2.364ms +Self CUDA time total: 26.238us @@ -4409,29 +4191,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.627us 908.24% 349.627us 349.627us 1 - torch_eager 7.45% 152.992us 99.76% 2.049ms 2.049ms 0.000us 0.00% 41.086us 41.086us 1 - aten::conv1d 0.32% 6.640us 6.06% 124.413us 41.471us 0.000us 0.00% 22.561us 7.520us 3 - aten::convolution 0.50% 10.370us 5.73% 117.773us 39.258us 0.000us 0.00% 22.561us 7.520us 3 - aten::_convolution 1.14% 23.411us 5.23% 107.403us 35.801us 0.000us 0.00% 22.561us 7.520us 3 - aten::_conv_depthwise2d 1.15% 23.650us 3.29% 67.532us 22.511us 22.561us 58.61% 22.561us 7.520us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.561us 58.61% 22.561us 7.520us 3 - aten::to 0.33% 6.780us 84.82% 1.743ms 290.446us 0.000us 0.00% 18.525us 3.087us 6 - aten::_to_copy 1.29% 26.502us 84.49% 1.736ms 289.316us 0.000us 0.00% 18.525us 3.087us 6 - aten::copy_ 2.40% 49.251us 81.74% 1.679ms 279.869us 15.934us 41.39% 18.525us 3.087us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.543us 22.19% 8.543us 2.848us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.391us 19.20% 7.391us 2.464us 3 - Activity Buffer Request 69.84% 1.435ms 69.84% 1.435ms 1.435ms 2.591us 6.73% 2.591us 2.591us 1 - aten::empty_strided 1.47% 30.182us 1.47% 30.182us 5.030us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.64% 218.664us 10.64% 218.664us 24.296us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.89% 18.281us 1.17% 24.011us 2.668us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.47% 9.739us 0.47% 9.739us 0.649us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.53% 10.991us 0.53% 10.991us 3.664us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.46% 9.421us 0.46% 9.421us 3.140us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.29% 5.970us 0.36% 7.320us 2.440us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 331.328us 858.50% 331.328us 331.328us 1 + torch_eager 5.97% 146.471us 99.79% 2.446ms 2.446ms 0.000us 0.00% 41.186us 41.186us 1 + aten::conv1d 0.25% 6.210us 4.77% 116.961us 38.987us 0.000us 0.00% 22.849us 7.616us 3 + aten::convolution 0.40% 9.740us 4.52% 110.751us 36.917us 0.000us 0.00% 22.849us 7.616us 3 + aten::_convolution 0.89% 21.911us 4.12% 101.011us 33.670us 0.000us 0.00% 22.849us 7.616us 3 + aten::_conv_depthwise2d 0.92% 22.550us 2.59% 63.530us 21.177us 22.849us 59.20% 22.849us 7.616us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.849us 59.20% 22.849us 7.616us 3 + aten::to 0.25% 6.228us 88.01% 2.158ms 359.617us 0.000us 0.00% 18.337us 3.056us 6 + aten::_to_copy 1.00% 24.602us 87.76% 2.151ms 358.579us 0.000us 0.00% 18.337us 3.056us 6 + aten::copy_ 1.98% 48.619us 85.49% 2.096ms 349.334us 15.745us 40.80% 18.337us 3.056us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.385us 21.73% 8.385us 2.795us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 19.07% 7.360us 2.453us 3 + Activity Buffer Request 75.73% 1.857ms 75.73% 1.857ms 1.857ms 2.592us 6.72% 2.592us 2.592us 1 + aten::empty_strided 1.26% 30.871us 1.26% 30.871us 5.145us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.69% 213.074us 8.69% 213.074us 23.675us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.69% 16.899us 0.91% 22.302us 2.478us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.35% 8.674us 0.35% 8.674us 0.578us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.39% 9.670us 0.39% 9.670us 3.223us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.37% 9.000us 0.37% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.570us 0.28% 6.790us 2.263us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.054ms -Self CUDA time total: 38.495us +Self CPU time total: 2.452ms +Self CUDA time total: 38.594us @@ -4441,29 +4223,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 345.054us 837.81% 345.054us 345.054us 1 - torch_eager 7.39% 151.695us 99.75% 2.049ms 2.049ms 0.000us 0.00% 43.810us 43.810us 1 - aten::conv1d 0.32% 6.620us 6.03% 123.883us 41.294us 0.000us 0.00% 25.375us 8.458us 3 - aten::convolution 0.50% 10.320us 5.71% 117.263us 39.088us 0.000us 0.00% 25.375us 8.458us 3 - aten::_convolution 1.20% 24.592us 5.21% 106.943us 35.648us 0.000us 0.00% 25.375us 8.458us 3 - aten::_conv_depthwise2d 1.13% 23.150us 3.19% 65.451us 21.817us 25.375us 61.61% 25.375us 8.458us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.375us 61.61% 25.375us 8.458us 3 - aten::to 0.31% 6.440us 84.93% 1.744ms 290.716us 0.000us 0.00% 18.435us 3.072us 6 - aten::_to_copy 1.24% 25.501us 84.61% 1.738ms 289.642us 0.000us 0.00% 18.435us 3.072us 6 - aten::copy_ 2.41% 49.431us 81.91% 1.682ms 280.380us 15.810us 38.39% 18.435us 3.072us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.386us 20.36% 8.386us 2.795us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 18.03% 7.424us 2.475us 3 - Activity Buffer Request 70.32% 1.444ms 70.32% 1.444ms 1.444ms 2.625us 6.37% 2.625us 2.625us 1 - aten::empty_strided 1.46% 30.070us 1.46% 30.070us 5.012us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.28% 211.144us 10.28% 211.144us 23.460us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.92% 18.949us 1.19% 24.411us 2.712us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 9.313us 0.45% 9.313us 0.621us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.52% 10.601us 0.52% 10.601us 3.534us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.44% 9.110us 0.44% 9.110us 3.037us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.29% 5.930us 0.36% 7.410us 2.470us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.382us 781.00% 324.382us 324.382us 1 + torch_eager 6.15% 143.693us 99.76% 2.329ms 2.329ms 0.000us 0.00% 44.158us 44.158us 1 + aten::conv1d 0.25% 5.870us 4.90% 114.381us 38.127us 0.000us 0.00% 25.694us 8.565us 3 + aten::convolution 0.39% 9.129us 4.65% 108.511us 36.170us 0.000us 0.00% 25.694us 8.565us 3 + aten::_convolution 0.92% 21.560us 4.26% 99.382us 33.127us 0.000us 0.00% 25.694us 8.565us 3 + aten::_conv_depthwise2d 0.91% 21.251us 2.67% 62.331us 20.777us 25.694us 61.86% 25.694us 8.565us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.694us 61.86% 25.694us 8.565us 3 + aten::to 0.26% 6.051us 87.64% 2.046ms 341.007us 0.000us 0.00% 18.464us 3.077us 6 + aten::_to_copy 0.99% 23.033us 87.38% 2.040ms 339.999us 0.000us 0.00% 18.464us 3.077us 6 + aten::copy_ 2.09% 48.709us 85.05% 1.985ms 330.910us 15.840us 38.14% 18.464us 3.077us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 20.34% 8.448us 2.816us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 17.80% 7.392us 2.464us 3 + Activity Buffer Request 74.80% 1.746ms 74.80% 1.746ms 1.746ms 2.624us 6.32% 2.624us 2.624us 1 + aten::empty_strided 1.35% 31.498us 1.35% 31.498us 5.250us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.10% 212.334us 9.10% 212.334us 23.593us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.70% 16.311us 0.92% 21.550us 2.394us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.38% 8.780us 0.38% 8.780us 0.585us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.39% 9.170us 0.39% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 10.170us 0.44% 10.170us 3.390us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.24% 5.530us 0.30% 6.891us 2.297us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.054ms -Self CUDA time total: 41.185us +Self CPU time total: 2.335ms +Self CUDA time total: 41.534us @@ -4473,29 +4255,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.348us 338.39% 348.348us 348.348us 1 - torch_eager 7.21% 148.863us 99.73% 2.059ms 2.059ms 0.000us 0.00% 108.926us 108.926us 1 - aten::conv1d 0.31% 6.430us 5.95% 122.893us 40.964us 0.000us 0.00% 70.592us 23.531us 3 - aten::convolution 0.50% 10.290us 5.64% 116.463us 38.821us 0.000us 0.00% 70.592us 23.531us 3 - aten::_convolution 1.17% 24.211us 5.14% 106.173us 35.391us 0.000us 0.00% 70.592us 23.531us 3 - aten::_conv_depthwise2d 1.12% 23.052us 3.16% 65.282us 21.761us 70.592us 68.57% 70.592us 23.531us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.592us 68.57% 70.592us 23.531us 3 - aten::to 0.31% 6.372us 85.15% 1.758ms 292.949us 0.000us 0.00% 38.334us 6.389us 6 - aten::_to_copy 1.20% 24.680us 84.84% 1.751ms 291.887us 0.000us 0.00% 38.334us 6.389us 6 - aten::copy_ 2.47% 51.072us 82.20% 1.697ms 282.787us 32.350us 31.43% 38.334us 6.389us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.695us 17.19% 17.695us 5.898us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.655us 14.24% 14.655us 4.885us 3 - Activity Buffer Request 70.59% 1.457ms 70.59% 1.457ms 1.457ms 5.984us 5.81% 5.984us 5.984us 1 - aten::empty_strided 1.45% 29.921us 1.45% 29.921us 4.987us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.23% 211.264us 10.23% 211.264us 23.474us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.89% 18.462us 1.17% 24.111us 2.679us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.47% 9.709us 0.47% 9.709us 0.647us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.47% 9.780us 0.47% 9.780us 3.260us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 9.740us 0.47% 9.740us 3.247us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.880us 0.35% 7.260us 2.420us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 319.038us 307.34% 319.038us 319.038us 1 + torch_eager 4.95% 115.620us 99.75% 2.329ms 2.329ms 0.000us 0.00% 109.886us 109.886us 1 + aten::conv1d 0.24% 5.500us 4.79% 111.722us 37.241us 0.000us 0.00% 71.360us 23.787us 3 + aten::convolution 0.38% 8.820us 4.55% 106.222us 35.407us 0.000us 0.00% 71.360us 23.787us 3 + aten::_convolution 0.86% 20.169us 4.17% 97.402us 32.467us 0.000us 0.00% 71.360us 23.787us 3 + aten::_conv_depthwise2d 0.88% 20.499us 2.70% 62.992us 20.997us 71.360us 68.74% 71.360us 23.787us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 71.360us 68.74% 71.360us 23.787us 3 + aten::to 0.25% 5.942us 88.99% 2.078ms 346.257us 0.000us 0.00% 38.526us 6.421us 6 + aten::_to_copy 0.97% 22.531us 88.74% 2.072ms 345.267us 0.000us 0.00% 38.526us 6.421us 6 + aten::copy_ 1.95% 45.459us 86.50% 2.019ms 336.557us 32.447us 31.26% 38.526us 6.421us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.791us 17.14% 17.791us 5.930us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.656us 14.12% 14.656us 4.885us 3 + Activity Buffer Request 76.44% 1.784ms 76.44% 1.784ms 1.784ms 6.079us 5.86% 6.079us 6.079us 1 + aten::empty_strided 1.27% 29.730us 1.27% 29.730us 4.955us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.13% 213.066us 9.13% 213.066us 23.674us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.66% 15.410us 0.85% 19.870us 2.208us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.33% 7.790us 0.33% 7.790us 0.519us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.44% 10.351us 0.44% 10.351us 3.450us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.36% 8.461us 0.36% 8.461us 2.820us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.23% 5.401us 0.29% 6.691us 2.230us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.064ms -Self CUDA time total: 102.942us +Self CPU time total: 2.335ms +Self CUDA time total: 103.807us @@ -4505,29 +4287,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 344.181us 304.53% 344.181us 344.181us 1 - torch_eager 14.98% 124.863us 99.35% 828.302us 828.302us 0.000us 0.00% 119.036us 119.036us 1 - aten::conv1d 0.70% 5.870us 14.55% 121.343us 40.448us 0.000us 0.00% 80.669us 26.890us 3 - aten::convolution 1.17% 9.720us 13.85% 115.473us 38.491us 0.000us 0.00% 80.669us 26.890us 3 - aten::_convolution 2.96% 24.691us 12.68% 105.753us 35.251us 0.000us 0.00% 80.669us 26.890us 3 - aten::_conv_depthwise2d 2.65% 22.121us 7.65% 63.762us 21.254us 80.669us 71.38% 80.669us 26.890us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.669us 71.38% 80.669us 26.890us 3 - aten::to 0.77% 6.429us 66.53% 554.705us 92.451us 0.000us 0.00% 38.367us 6.394us 6 - aten::_to_copy 3.01% 25.101us 65.76% 548.276us 91.379us 0.000us 0.00% 38.367us 6.394us 6 - aten::copy_ 6.16% 51.352us 59.05% 492.343us 82.057us 32.351us 28.62% 38.367us 6.394us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.696us 15.66% 17.696us 5.899us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.655us 12.97% 14.655us 4.885us 3 - Activity Buffer Request 28.81% 240.197us 28.81% 240.197us 240.197us 6.016us 5.32% 6.016us 6.016us 1 - aten::empty_strided 3.70% 30.832us 3.70% 30.832us 5.139us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 26.65% 222.174us 26.65% 222.174us 24.686us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.09% 17.401us 2.70% 22.541us 2.505us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.05% 8.790us 1.05% 8.790us 0.586us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.34% 11.151us 1.34% 11.151us 3.717us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.09% 9.110us 1.09% 9.110us 3.037us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.89% 7.450us 1.05% 8.790us 2.930us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 320.032us 281.56% 320.032us 320.032us 1 + torch_eager 4.89% 112.502us 99.77% 2.297ms 2.297ms 0.000us 0.00% 119.649us 119.649us 1 + aten::conv1d 0.24% 5.540us 4.86% 111.980us 37.327us 0.000us 0.00% 81.407us 27.136us 3 + aten::convolution 0.38% 8.839us 4.62% 106.440us 35.480us 0.000us 0.00% 81.407us 27.136us 3 + aten::_convolution 0.90% 20.821us 4.24% 97.601us 32.534us 0.000us 0.00% 81.407us 27.136us 3 + aten::_conv_depthwise2d 0.94% 21.639us 2.69% 61.990us 20.663us 81.407us 71.62% 81.407us 27.136us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 81.407us 71.62% 81.407us 27.136us 3 + aten::to 0.26% 5.912us 88.93% 2.047ms 341.211us 0.000us 0.00% 38.242us 6.374us 6 + aten::_to_copy 0.96% 22.099us 88.68% 2.041ms 340.225us 0.000us 0.00% 38.242us 6.374us 6 + aten::copy_ 2.13% 49.062us 86.51% 1.991ms 331.902us 32.257us 28.38% 38.242us 6.374us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.665us 15.54% 17.665us 5.888us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.592us 12.84% 14.592us 4.864us 3 + Activity Buffer Request 76.05% 1.751ms 76.05% 1.751ms 1.751ms 5.985us 5.27% 5.985us 5.985us 1 + aten::empty_strided 1.21% 27.841us 1.21% 27.841us 4.640us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.26% 213.213us 9.26% 213.213us 23.690us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.70% 16.150us 0.91% 21.061us 2.340us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.36% 8.381us 0.36% 8.381us 0.559us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.40% 9.130us 0.40% 9.130us 3.043us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.42% 9.600us 0.42% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.24% 5.419us 0.29% 6.669us 2.223us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 833.752us -Self CUDA time total: 113.020us +Self CPU time total: 2.302ms +Self CUDA time total: 113.664us @@ -4537,29 +4319,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 14.21% 122.455us 95.83% 825.681us 825.681us 0.000us 0.00% 433.339us 433.339us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 419.771us 106.59% 419.771us 419.771us 1 - aten::conv1d 0.75% 6.429us 14.10% 121.522us 40.507us 0.000us 0.00% 251.453us 83.818us 3 - aten::convolution 1.15% 9.929us 13.36% 115.093us 38.364us 0.000us 0.00% 251.453us 83.818us 3 - aten::_convolution 2.67% 23.042us 12.21% 105.164us 35.055us 0.000us 0.00% 251.453us 83.818us 3 - aten::_conv_depthwise2d 2.60% 22.440us 7.52% 64.810us 21.603us 251.453us 63.85% 251.453us 83.818us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.453us 63.85% 251.453us 83.818us 3 - aten::to 0.70% 6.001us 64.14% 552.672us 92.112us 0.000us 0.00% 181.886us 30.314us 6 - aten::_to_copy 2.73% 23.540us 63.45% 546.671us 91.112us 0.000us 0.00% 181.886us 30.314us 6 - aten::copy_ 5.94% 51.140us 57.36% 494.211us 82.368us 142.367us 36.15% 181.886us 30.314us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.367us 25.99% 102.367us 34.122us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.000us 10.16% 40.000us 13.333us 3 - Activity Buffer Request 29.04% 250.247us 29.04% 250.247us 250.247us 39.519us 10.03% 39.519us 39.519us 1 - aten::empty_strided 3.36% 28.920us 3.36% 28.920us 4.820us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.89% 214.494us 24.89% 214.494us 23.833us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.98% 17.062us 2.59% 22.273us 2.475us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.09% 9.391us 1.09% 9.391us 0.626us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.24% 10.660us 1.24% 10.660us 3.553us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.17% 10.040us 1.17% 10.040us 3.347us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.86% 7.370us 1.02% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 4.70% 113.641us 96.03% 2.320ms 2.320ms 0.000us 0.00% 464.763us 464.763us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 453.786us 106.62% 453.786us 453.786us 1 + aten::conv1d 0.23% 5.630us 4.62% 111.673us 37.224us 0.000us 0.00% 278.940us 92.980us 3 + aten::convolution 0.36% 8.651us 4.39% 106.043us 35.348us 0.000us 0.00% 278.940us 92.980us 3 + aten::_convolution 0.86% 20.739us 4.03% 97.392us 32.464us 0.000us 0.00% 278.940us 92.980us 3 + aten::_conv_depthwise2d 0.90% 21.710us 2.57% 62.062us 20.687us 278.940us 65.54% 278.940us 92.980us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 278.940us 65.54% 278.940us 92.980us 3 + aten::to 0.24% 5.880us 85.69% 2.071ms 345.102us 0.000us 0.00% 185.823us 30.970us 6 + aten::_to_copy 0.90% 21.820us 85.45% 2.065ms 344.122us 0.000us 0.00% 185.823us 30.970us 6 + aten::copy_ 1.99% 48.071us 83.40% 2.015ms 335.882us 146.655us 34.46% 185.823us 30.970us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 105.919us 24.89% 105.919us 35.306us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.736us 9.57% 40.736us 13.579us 3 + Activity Buffer Request 72.26% 1.746ms 72.26% 1.746ms 1.746ms 39.168us 9.20% 39.168us 39.168us 1 + aten::empty_strided 1.14% 27.621us 1.14% 27.621us 4.604us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.07% 243.344us 10.07% 243.344us 27.038us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.66% 15.908us 0.86% 20.760us 2.307us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.34% 8.262us 0.34% 8.262us 0.551us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.37% 8.921us 0.37% 8.921us 2.974us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.38% 9.260us 0.38% 9.260us 3.087us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.22% 5.361us 0.27% 6.641us 2.214us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 861.602us -Self CUDA time total: 393.820us +Self CPU time total: 2.416ms +Self CUDA time total: 425.595us @@ -4569,29 +4351,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 15.32% 134.312us 91.67% 803.971us 803.971us 0.000us 0.00% 487.924us 487.924us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 476.501us 106.34% 476.501us 476.501us 1 - aten::conv1d 0.67% 5.860us 13.82% 121.173us 40.391us 0.000us 0.00% 299.161us 99.720us 3 - aten::convolution 1.17% 10.220us 13.15% 115.313us 38.438us 0.000us 0.00% 299.161us 99.720us 3 - aten::_convolution 2.67% 23.450us 11.98% 105.093us 35.031us 0.000us 0.00% 299.161us 99.720us 3 - aten::_conv_depthwise2d 2.56% 22.451us 7.48% 65.623us 21.874us 299.161us 66.76% 299.161us 99.720us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 299.161us 66.76% 299.161us 99.720us 3 - aten::to 0.69% 6.051us 59.17% 518.906us 86.484us 0.000us 0.00% 188.763us 31.460us 6 - aten::_to_copy 2.71% 23.771us 58.48% 512.855us 85.476us 0.000us 0.00% 188.763us 31.460us 6 - aten::copy_ 5.69% 49.880us 52.31% 458.742us 76.457us 148.924us 33.24% 188.763us 31.460us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.861us 24.29% 108.861us 36.287us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.063us 8.94% 40.063us 13.354us 3 - Activity Buffer Request 25.01% 219.366us 25.01% 219.366us 219.366us 39.839us 8.89% 39.839us 39.839us 1 - aten::empty_strided 3.46% 30.342us 3.46% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.34% 213.439us 24.34% 213.439us 23.715us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.98% 17.400us 2.59% 22.720us 2.524us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.09% 9.540us 1.09% 9.540us 0.636us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.14% 10.010us 1.14% 10.010us 3.337us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.05% 9.219us 1.05% 9.219us 3.073us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.66% 5.750us 0.82% 7.210us 2.403us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 4.81% 115.230us 95.51% 2.289ms 2.289ms 0.000us 0.00% 473.560us 473.560us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 466.268us 106.59% 466.268us 466.268us 1 + aten::conv1d 0.23% 5.540us 4.63% 111.002us 37.001us 0.000us 0.00% 298.430us 99.477us 3 + aten::convolution 0.37% 8.900us 4.40% 105.462us 35.154us 0.000us 0.00% 298.430us 99.477us 3 + aten::_convolution 0.85% 20.430us 4.03% 96.562us 32.187us 0.000us 0.00% 298.430us 99.477us 3 + aten::_conv_depthwise2d 0.86% 20.562us 2.57% 61.592us 20.531us 298.430us 68.22% 298.430us 99.477us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.430us 68.22% 298.430us 99.477us 3 + aten::to 0.24% 5.669us 85.05% 2.039ms 339.802us 0.000us 0.00% 175.130us 29.188us 6 + aten::_to_copy 0.96% 22.942us 84.82% 2.033ms 338.857us 0.000us 0.00% 175.130us 29.188us 6 + aten::copy_ 2.01% 48.190us 82.64% 1.981ms 330.170us 139.003us 31.78% 175.130us 29.188us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 98.430us 22.50% 98.430us 32.810us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.573us 9.28% 40.573us 13.524us 3 + Activity Buffer Request 72.81% 1.745ms 72.81% 1.745ms 1.745ms 36.127us 8.26% 36.127us 36.127us 1 + aten::empty_strided 1.22% 29.180us 1.22% 29.180us 4.863us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.73% 209.224us 8.73% 209.224us 23.247us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.66% 15.770us 0.87% 20.750us 2.306us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.35% 8.340us 0.35% 8.340us 0.556us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.43% 10.290us 0.43% 10.290us 3.430us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.37% 8.960us 0.37% 8.960us 2.987us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.22% 5.340us 0.28% 6.610us 2.203us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 876.983us -Self CUDA time total: 448.085us +Self CPU time total: 2.397ms +Self CUDA time total: 437.433us @@ -4601,29 +4383,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.392us 1804.85% 338.392us 338.392us 1 - torch_eager 18.33% 161.236us 99.35% 873.703us 873.703us 0.000us 0.00% 20.637us 20.637us 1 - aten::to 0.69% 6.070us 63.71% 560.224us 93.371us 0.000us 0.00% 13.406us 2.234us 6 - aten::_to_copy 2.78% 24.471us 63.02% 554.154us 92.359us 0.000us 0.00% 13.406us 2.234us 6 - aten::copy_ 5.94% 52.212us 56.85% 499.953us 83.325us 11.518us 61.43% 13.406us 2.234us 6 - aten::conv1d 0.64% 5.659us 14.02% 123.282us 41.094us 0.000us 0.00% 7.231us 2.410us 3 - aten::convolution 1.14% 9.999us 13.38% 117.623us 39.208us 0.000us 0.00% 7.231us 2.410us 3 - aten::_convolution 2.72% 23.952us 12.24% 107.624us 35.875us 0.000us 0.00% 7.231us 2.410us 3 - aten::_conv_depthwise2d 2.67% 23.519us 7.63% 67.130us 22.377us 7.231us 38.57% 7.231us 2.410us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.231us 38.57% 7.231us 2.410us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.854us 31.22% 5.854us 1.951us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 30.21% 5.664us 1.888us 3 - Activity Buffer Request 29.52% 259.596us 29.52% 259.596us 259.596us 1.888us 10.07% 1.888us 1.888us 1 - aten::empty_strided 3.38% 29.730us 3.38% 29.730us 4.955us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 23.99% 210.946us 23.99% 210.946us 23.438us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.07% 18.190us 2.71% 23.871us 2.652us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.11% 9.761us 1.11% 9.761us 0.651us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.24% 10.890us 1.24% 10.890us 3.630us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.13% 9.920us 1.13% 9.920us 3.307us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.68% 5.972us 0.85% 7.452us 2.484us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.149us 1725.02% 325.149us 325.149us 1 + torch_eager 4.86% 112.628us 99.78% 2.311ms 2.311ms 0.000us 0.00% 20.769us 20.769us 1 + aten::to 0.26% 5.932us 88.67% 2.054ms 342.251us 0.000us 0.00% 13.536us 2.256us 6 + aten::_to_copy 1.00% 23.270us 88.41% 2.048ms 341.262us 0.000us 0.00% 13.536us 2.256us 6 + aten::copy_ 2.14% 49.511us 86.15% 1.995ms 332.552us 11.616us 61.63% 13.536us 2.256us 6 + aten::conv1d 0.24% 5.480us 5.19% 120.221us 40.074us 0.000us 0.00% 7.233us 2.411us 3 + aten::convolution 0.37% 8.641us 4.95% 114.741us 38.247us 0.000us 0.00% 7.233us 2.411us 3 + aten::_convolution 0.88% 20.361us 4.58% 106.100us 35.367us 0.000us 0.00% 7.233us 2.411us 3 + aten::_conv_depthwise2d 0.96% 22.180us 3.05% 70.680us 23.560us 7.233us 38.37% 7.233us 2.411us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.233us 38.37% 7.233us 2.411us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 31.41% 5.920us 1.973us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.22% 5.696us 1.899us 3 + Activity Buffer Request 75.90% 1.758ms 75.90% 1.758ms 1.758ms 1.920us 10.19% 1.920us 1.920us 1 + aten::empty_strided 1.25% 28.990us 1.25% 28.990us 4.832us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.42% 218.162us 9.42% 218.162us 24.240us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.68% 15.833us 0.90% 20.731us 2.303us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.37% 8.468us 0.37% 8.468us 0.565us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.40% 9.220us 0.40% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.39% 8.980us 0.39% 8.980us 2.993us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.24% 5.550us 0.30% 7.000us 2.333us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 879.393us -Self CUDA time total: 18.749us +Self CPU time total: 2.316ms +Self CUDA time total: 18.849us @@ -4633,29 +4415,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.934us 1741.87% 338.934us 338.934us 1 - torch_eager 16.71% 145.362us 99.29% 863.592us 863.592us 0.000us 0.00% 21.314us 21.314us 1 - aten::to 0.71% 6.200us 65.36% 568.524us 94.754us 0.000us 0.00% 13.282us 2.214us 6 - aten::_to_copy 2.85% 24.831us 64.65% 562.324us 93.721us 0.000us 0.00% 13.282us 2.214us 6 - aten::copy_ 5.81% 50.550us 58.39% 507.883us 84.647us 11.426us 58.72% 13.282us 2.214us 6 - aten::conv1d 0.78% 6.753us 14.06% 122.315us 40.772us 0.000us 0.00% 8.032us 2.677us 3 - aten::convolution 1.19% 10.380us 13.29% 115.562us 38.521us 0.000us 0.00% 8.032us 2.677us 3 - aten::_convolution 2.63% 22.841us 12.09% 105.182us 35.061us 0.000us 0.00% 8.032us 2.677us 3 - aten::_conv_depthwise2d 2.65% 23.042us 7.65% 66.512us 22.171us 8.032us 41.28% 8.032us 2.677us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 41.28% 8.032us 2.677us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.825us 29.94% 5.825us 1.942us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.601us 28.79% 5.601us 1.867us 3 - Activity Buffer Request 30.62% 266.307us 30.62% 266.307us 266.307us 1.856us 9.54% 1.856us 1.856us 1 - aten::empty_strided 3.40% 29.610us 3.40% 29.610us 4.935us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.61% 214.076us 24.61% 214.076us 23.786us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.02% 17.612us 2.63% 22.841us 2.538us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.02% 8.840us 1.02% 8.840us 0.589us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.22% 10.630us 1.22% 10.630us 3.543us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.13% 9.790us 1.13% 9.790us 3.263us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.67% 5.798us 0.82% 7.109us 2.370us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 320.511us 1636.76% 320.511us 320.511us 1 + torch_eager 5.91% 139.372us 99.79% 2.353ms 2.353ms 0.000us 0.00% 21.598us 21.598us 1 + aten::to 0.25% 6.010us 87.93% 2.073ms 345.496us 0.000us 0.00% 13.663us 2.277us 6 + aten::_to_copy 0.96% 22.549us 87.67% 2.067ms 344.494us 0.000us 0.00% 13.663us 2.277us 6 + aten::copy_ 2.09% 49.251us 85.51% 2.016ms 335.977us 11.647us 59.48% 13.663us 2.277us 6 + aten::conv1d 0.26% 6.081us 4.89% 115.321us 38.440us 0.000us 0.00% 7.935us 2.645us 3 + aten::convolution 0.40% 9.450us 4.63% 109.240us 36.413us 0.000us 0.00% 7.935us 2.645us 3 + aten::_convolution 0.90% 21.168us 4.23% 99.790us 33.263us 0.000us 0.00% 7.935us 2.645us 3 + aten::_conv_depthwise2d 0.87% 20.610us 2.67% 62.871us 20.957us 7.935us 40.52% 7.935us 2.645us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 40.52% 7.935us 2.645us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 30.55% 5.983us 1.994us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 28.92% 5.664us 1.888us 3 + Activity Buffer Request 75.47% 1.779ms 75.47% 1.779ms 1.779ms 2.016us 10.30% 2.016us 2.016us 1 + aten::empty_strided 1.21% 28.551us 1.21% 28.551us 4.759us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.91% 210.105us 8.91% 210.105us 23.345us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.72% 16.961us 0.93% 21.872us 2.430us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.36% 8.422us 0.36% 8.422us 0.561us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.46% 10.910us 0.46% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.37% 8.650us 0.37% 8.650us 2.883us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.24% 5.579us 0.30% 6.970us 2.323us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 869.783us -Self CUDA time total: 19.458us +Self CPU time total: 2.358ms +Self CUDA time total: 19.582us @@ -4665,29 +4447,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.862us 1751.78% 340.862us 340.862us 1 - torch_eager 8.44% 173.073us 99.74% 2.045ms 2.045ms 0.000us 0.00% 21.635us 21.635us 1 - aten::to 0.33% 6.670us 84.06% 1.723ms 287.196us 0.000us 0.00% 14.307us 2.385us 6 - aten::_to_copy 1.21% 24.883us 83.74% 1.717ms 286.084us 0.000us 0.00% 14.307us 2.385us 6 - aten::copy_ 2.36% 48.471us 81.06% 1.662ms 276.949us 12.130us 62.34% 14.307us 2.385us 6 - aten::conv1d 0.29% 5.970us 5.84% 119.613us 39.871us 0.000us 0.00% 7.328us 2.443us 3 - aten::convolution 0.48% 9.780us 5.54% 113.643us 37.881us 0.000us 0.00% 7.328us 2.443us 3 - aten::_convolution 1.14% 23.420us 5.07% 103.863us 34.621us 0.000us 0.00% 7.328us 2.443us 3 - aten::_conv_depthwise2d 1.10% 22.512us 3.15% 64.503us 21.501us 7.328us 37.66% 7.328us 2.443us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.66% 7.328us 2.443us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.241us 32.07% 6.241us 2.080us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 30.27% 5.889us 1.963us 3 - Activity Buffer Request 69.34% 1.421ms 69.34% 1.421ms 1.421ms 2.177us 11.19% 2.177us 2.177us 1 - aten::empty_strided 1.46% 29.930us 1.46% 29.930us 4.988us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.50% 215.256us 10.50% 215.256us 23.917us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.86% 17.669us 1.13% 23.180us 2.576us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.47% 9.581us 0.47% 9.581us 0.639us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.48% 9.759us 0.48% 9.759us 3.253us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.43% 8.742us 0.43% 8.742us 2.914us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.760us 0.35% 7.110us 2.370us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 310.009us 1591.01% 310.009us 310.009us 1 + torch_eager 14.85% 113.881us 99.35% 762.102us 762.102us 0.000us 0.00% 21.693us 21.693us 1 + aten::to 0.75% 5.742us 67.36% 516.710us 86.118us 0.000us 0.00% 14.398us 2.400us 6 + aten::_to_copy 2.84% 21.798us 66.61% 510.968us 85.161us 0.000us 0.00% 14.398us 2.400us 6 + aten::copy_ 6.26% 48.021us 59.81% 458.808us 76.468us 12.190us 62.56% 14.398us 2.400us 6 + aten::conv1d 0.69% 5.290us 14.07% 107.951us 35.984us 0.000us 0.00% 7.295us 2.432us 3 + aten::convolution 1.14% 8.770us 13.38% 102.661us 34.220us 0.000us 0.00% 7.295us 2.432us 3 + aten::_convolution 2.56% 19.629us 12.24% 93.891us 31.297us 0.000us 0.00% 7.295us 2.432us 3 + aten::_conv_depthwise2d 2.72% 20.851us 7.84% 60.152us 20.051us 7.295us 37.44% 7.295us 2.432us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.295us 37.44% 7.295us 2.432us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.271us 32.18% 6.271us 2.090us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 30.38% 5.919us 1.973us 3 + Activity Buffer Request 29.70% 227.833us 29.70% 227.833us 227.833us 2.208us 11.33% 2.208us 2.208us 1 + aten::empty_strided 3.96% 30.362us 3.96% 30.362us 5.060us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 26.62% 204.185us 26.62% 204.185us 22.687us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.01% 15.431us 2.57% 19.700us 2.189us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.98% 7.520us 0.98% 7.520us 0.501us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.29% 9.930us 1.29% 9.930us 3.310us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.06% 8.140us 1.06% 8.140us 2.713us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.67% 5.119us 0.83% 6.400us 2.133us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.050ms -Self CUDA time total: 19.458us +Self CPU time total: 767.122us +Self CUDA time total: 19.485us @@ -4697,29 +4479,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 367.067us 1820.95% 367.067us 367.067us 1 - torch_eager 17.50% 145.595us 99.30% 826.111us 826.111us 0.000us 0.00% 22.366us 22.366us 1 - aten::to 0.75% 6.199us 63.72% 530.082us 88.347us 0.000us 0.00% 14.431us 2.405us 6 - aten::_to_copy 2.95% 24.573us 62.97% 523.883us 87.314us 0.000us 0.00% 14.431us 2.405us 6 - aten::copy_ 6.31% 52.521us 56.15% 467.170us 77.862us 12.223us 60.64% 14.431us 2.405us 6 - aten::conv1d 0.69% 5.760us 14.59% 121.354us 40.451us 0.000us 0.00% 7.935us 2.645us 3 - aten::convolution 1.24% 10.281us 13.89% 115.594us 38.531us 0.000us 0.00% 7.935us 2.645us 3 - aten::_convolution 2.68% 22.269us 12.66% 105.313us 35.104us 0.000us 0.00% 7.935us 2.645us 3 - aten::_conv_depthwise2d 2.73% 22.701us 8.02% 66.711us 22.237us 7.935us 39.36% 7.935us 2.645us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 39.36% 7.935us 2.645us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 31.27% 6.304us 2.101us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 29.36% 5.919us 1.973us 3 - Activity Buffer Request 27.00% 224.665us 27.00% 224.665us 224.665us 2.208us 10.95% 2.208us 2.208us 1 - aten::empty_strided 3.86% 32.140us 3.86% 32.140us 5.357us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.71% 213.894us 25.71% 213.894us 23.766us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.05% 17.041us 2.71% 22.553us 2.506us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.14% 9.503us 1.14% 9.503us 0.634us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.31% 10.920us 1.31% 10.920us 3.640us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.10% 9.180us 1.10% 9.180us 3.060us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.81% 6.740us 0.98% 8.160us 2.720us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 312.058us 1547.83% 312.058us 312.058us 1 + torch_eager 19.84% 167.701us 99.34% 839.603us 839.603us 0.000us 0.00% 22.369us 22.369us 1 + aten::to 0.69% 5.791us 63.55% 537.169us 89.528us 0.000us 0.00% 14.400us 2.400us 6 + aten::_to_copy 2.59% 21.910us 62.87% 531.378us 88.563us 0.000us 0.00% 14.400us 2.400us 6 + aten::copy_ 5.79% 48.970us 56.91% 481.028us 80.171us 12.192us 60.47% 14.400us 2.400us 6 + aten::conv1d 0.65% 5.520us 13.10% 110.752us 36.917us 0.000us 0.00% 7.969us 2.656us 3 + aten::convolution 1.03% 8.700us 12.45% 105.232us 35.077us 0.000us 0.00% 7.969us 2.656us 3 + aten::_convolution 2.40% 20.311us 11.42% 96.532us 32.177us 0.000us 0.00% 7.969us 2.656us 3 + aten::_conv_depthwise2d 2.39% 20.240us 7.28% 61.521us 20.507us 7.969us 39.53% 7.969us 2.656us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.969us 39.53% 7.969us 2.656us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 31.11% 6.272us 2.091us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.36% 5.920us 1.973us 3 + Activity Buffer Request 29.19% 246.714us 29.19% 246.714us 246.714us 2.208us 10.95% 2.208us 2.208us 1 + aten::empty_strided 3.36% 28.440us 3.36% 28.440us 4.740us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.70% 208.775us 24.70% 208.775us 23.197us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.84% 15.580us 2.41% 20.350us 2.261us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.95% 8.049us 0.95% 8.049us 0.537us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.07% 9.050us 1.07% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.04% 8.800us 1.04% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.63% 5.361us 0.79% 6.650us 2.217us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 831.951us -Self CUDA time total: 20.158us +Self CPU time total: 845.213us +Self CUDA time total: 20.161us @@ -4729,29 +4511,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 363.100us 1005.93% 363.100us 363.100us 1 - torch_eager 14.77% 122.163us 99.35% 821.971us 821.971us 0.000us 0.00% 38.688us 38.688us 1 - aten::conv1d 0.72% 5.951us 17.29% 143.024us 47.675us 0.000us 0.00% 20.160us 6.720us 3 - aten::convolution 1.22% 10.110us 16.57% 137.073us 45.691us 0.000us 0.00% 20.160us 6.720us 3 - aten::_convolution 3.04% 25.151us 15.35% 126.963us 42.321us 0.000us 0.00% 20.160us 6.720us 3 - aten::_conv_depthwise2d 4.80% 39.711us 10.31% 85.271us 28.424us 20.160us 55.85% 20.160us 6.720us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.160us 55.85% 20.160us 6.720us 3 - aten::to 0.75% 6.172us 63.79% 527.804us 87.967us 0.000us 0.00% 18.528us 3.088us 6 - aten::_to_copy 2.99% 24.751us 63.05% 521.632us 86.939us 0.000us 0.00% 18.528us 3.088us 6 - aten::copy_ 6.14% 50.790us 56.45% 467.021us 77.837us 15.936us 44.15% 18.528us 3.088us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.512us 23.58% 8.512us 2.837us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 20.57% 7.424us 2.475us 3 - Activity Buffer Request 27.93% 231.066us 27.93% 231.066us 231.066us 2.592us 7.18% 2.592us 2.592us 1 - aten::empty_strided 3.61% 29.860us 3.61% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.33% 209.585us 25.33% 209.585us 23.287us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.11% 17.441us 2.75% 22.791us 2.532us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.15% 9.501us 1.15% 9.501us 0.633us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.26% 10.400us 1.26% 10.400us 3.467us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.30% 10.740us 1.30% 10.740us 3.580us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.76% 6.269us 0.93% 7.730us 2.577us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 312.867us 859.95% 312.867us 312.867us 1 + torch_eager 14.44% 112.752us 99.36% 776.042us 776.042us 0.000us 0.00% 39.006us 39.006us 1 + aten::conv1d 0.71% 5.580us 13.99% 109.252us 36.417us 0.000us 0.00% 20.512us 6.837us 3 + aten::convolution 1.09% 8.531us 13.27% 103.672us 34.557us 0.000us 0.00% 20.512us 6.837us 3 + aten::_convolution 2.62% 20.459us 12.18% 95.141us 31.714us 0.000us 0.00% 20.512us 6.837us 3 + aten::_conv_depthwise2d 2.59% 20.222us 7.70% 60.162us 20.054us 20.512us 56.38% 20.512us 6.837us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.512us 56.38% 20.512us 6.837us 3 + aten::to 0.75% 5.821us 67.81% 529.608us 88.268us 0.000us 0.00% 18.494us 3.082us 6 + aten::_to_copy 2.86% 22.338us 67.06% 523.787us 87.298us 0.000us 0.00% 18.494us 3.082us 6 + aten::copy_ 6.02% 47.020us 60.45% 472.148us 78.691us 15.870us 43.62% 18.494us 3.082us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.447us 23.22% 8.447us 2.816us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.423us 20.40% 7.423us 2.474us 3 + Activity Buffer Request 30.80% 240.594us 30.80% 240.594us 240.594us 2.624us 7.21% 2.624us 2.624us 1 + aten::empty_strided 3.75% 29.301us 3.75% 29.301us 4.884us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 26.46% 206.633us 26.46% 206.633us 22.959us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.01% 15.720us 2.61% 20.410us 2.268us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.02% 7.981us 1.02% 7.981us 0.532us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.13% 8.841us 1.13% 8.841us 2.947us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.15% 9.000us 1.15% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.68% 5.329us 0.84% 6.560us 2.187us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 827.381us -Self CUDA time total: 36.096us +Self CPU time total: 781.073us +Self CUDA time total: 36.382us @@ -4761,29 +4543,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.025us 883.88% 336.025us 336.025us 1 - torch_eager 14.70% 120.902us 99.36% 817.351us 817.351us 0.000us 0.00% 40.610us 40.610us 1 - aten::conv1d 0.71% 5.820us 14.44% 118.823us 39.608us 0.000us 0.00% 22.304us 7.435us 3 - aten::convolution 1.12% 9.190us 13.74% 113.003us 37.668us 0.000us 0.00% 22.304us 7.435us 3 - aten::_convolution 2.83% 23.270us 12.62% 103.813us 34.604us 0.000us 0.00% 22.304us 7.435us 3 - aten::_conv_depthwise2d 2.83% 23.309us 7.79% 64.072us 21.357us 22.304us 58.67% 22.304us 7.435us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.304us 58.67% 22.304us 7.435us 3 - aten::to 0.73% 5.990us 66.75% 549.075us 91.513us 0.000us 0.00% 18.306us 3.051us 6 - aten::_to_copy 2.91% 23.953us 66.02% 543.085us 90.514us 0.000us 0.00% 18.306us 3.051us 6 - aten::copy_ 6.07% 49.902us 59.57% 490.042us 81.674us 15.713us 41.33% 18.306us 3.051us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.353us 21.97% 8.353us 2.784us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 19.36% 7.360us 2.453us 3 - Activity Buffer Request 30.85% 253.806us 30.85% 253.806us 253.806us 2.593us 6.82% 2.593us 2.593us 1 - aten::empty_strided 3.54% 29.090us 3.54% 29.090us 4.848us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.29% 208.074us 25.29% 208.074us 23.119us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.19% 18.051us 2.84% 23.371us 2.597us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.11% 9.160us 1.11% 9.160us 0.611us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.21% 9.961us 1.21% 9.961us 3.320us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.10% 9.062us 1.10% 9.062us 3.021us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.80% 6.580us 0.96% 7.920us 2.640us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 353.311us 916.31% 353.311us 353.311us 1 + torch_eager 17.31% 144.171us 99.40% 827.943us 827.943us 0.000us 0.00% 41.150us 41.150us 1 + aten::conv1d 0.66% 5.470us 14.12% 117.601us 39.200us 0.000us 0.00% 22.624us 7.541us 3 + aten::convolution 1.09% 9.120us 13.46% 112.131us 37.377us 0.000us 0.00% 22.624us 7.541us 3 + aten::_convolution 2.77% 23.100us 12.37% 103.011us 34.337us 0.000us 0.00% 22.624us 7.541us 3 + aten::_conv_depthwise2d 2.63% 21.901us 7.78% 64.791us 21.597us 22.624us 58.68% 22.624us 7.541us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.624us 58.68% 22.624us 7.541us 3 + aten::to 0.71% 5.920us 64.88% 540.450us 90.075us 0.000us 0.00% 18.526us 3.088us 6 + aten::_to_copy 2.59% 21.613us 64.17% 534.530us 89.088us 0.000us 0.00% 18.526us 3.088us 6 + aten::copy_ 5.88% 48.990us 58.06% 483.646us 80.608us 15.934us 41.32% 18.526us 3.088us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.575us 22.24% 8.575us 2.858us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.09% 7.359us 2.453us 3 + Activity Buffer Request 29.91% 249.164us 29.91% 249.164us 249.164us 2.592us 6.72% 2.592us 2.592us 1 + aten::empty_strided 3.51% 29.271us 3.51% 29.271us 4.879us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.18% 209.712us 25.18% 209.712us 23.301us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.99% 16.542us 2.59% 21.611us 2.401us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.04% 8.638us 1.04% 8.638us 0.576us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.16% 9.650us 1.16% 9.650us 3.217us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.08% 9.020us 1.08% 9.020us 3.007us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.68% 5.681us 0.85% 7.060us 2.353us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 822.611us -Self CUDA time total: 38.017us +Self CPU time total: 832.973us +Self CUDA time total: 38.558us @@ -4793,29 +4575,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.486us 522.89% 335.486us 335.486us 1 - torch_eager 15.29% 123.163us 99.38% 800.491us 800.491us 0.000us 0.00% 68.256us 68.256us 1 - aten::conv1d 0.73% 5.840us 14.87% 119.763us 39.921us 0.000us 0.00% 41.760us 13.920us 3 - aten::convolution 1.21% 9.761us 14.14% 113.923us 37.974us 0.000us 0.00% 41.760us 13.920us 3 - aten::_convolution 2.84% 22.911us 12.93% 104.162us 34.721us 0.000us 0.00% 41.760us 13.920us 3 - aten::_conv_depthwise2d 2.80% 22.570us 8.02% 64.572us 21.524us 41.760us 65.09% 41.760us 13.920us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.760us 65.09% 41.760us 13.920us 3 - aten::to 0.73% 5.842us 65.67% 528.904us 88.151us 0.000us 0.00% 26.496us 4.416us 6 - aten::_to_copy 2.94% 23.712us 64.94% 523.062us 87.177us 0.000us 0.00% 26.496us 4.416us 6 - aten::copy_ 6.02% 48.492us 58.29% 469.521us 78.253us 22.400us 34.91% 26.496us 4.416us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.968us 18.65% 11.968us 3.989us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 16.26% 10.432us 3.477us 3 - Activity Buffer Request 29.33% 236.206us 29.33% 236.206us 236.206us 4.096us 6.38% 4.096us 4.096us 1 - aten::empty_strided 3.70% 29.829us 3.70% 29.829us 4.971us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.91% 208.693us 25.91% 208.693us 23.188us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.18% 17.569us 2.86% 23.069us 2.563us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.14% 9.222us 1.14% 9.222us 0.615us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.20% 9.631us 1.20% 9.631us 3.210us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.06% 8.501us 1.06% 8.501us 2.834us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.83% 6.660us 0.99% 7.990us 2.663us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 316.829us 488.45% 316.829us 316.829us 1 + torch_eager 14.19% 114.002us 99.33% 798.183us 798.183us 0.000us 0.00% 68.991us 68.991us 1 + aten::conv1d 0.68% 5.460us 13.80% 110.892us 36.964us 0.000us 0.00% 42.304us 14.101us 3 + aten::convolution 1.10% 8.859us 13.12% 105.432us 35.144us 0.000us 0.00% 42.304us 14.101us 3 + aten::_convolution 2.59% 20.821us 12.02% 96.573us 32.191us 0.000us 0.00% 42.304us 14.101us 3 + aten::_conv_depthwise2d 2.64% 21.190us 7.50% 60.251us 20.084us 42.304us 65.22% 42.304us 14.101us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 42.304us 65.22% 42.304us 14.101us 3 + aten::to 0.75% 6.059us 68.35% 549.177us 91.530us 0.000us 0.00% 26.687us 4.448us 6 + aten::_to_copy 2.76% 22.169us 67.59% 543.118us 90.520us 0.000us 0.00% 26.687us 4.448us 6 + aten::copy_ 6.74% 54.161us 61.27% 492.308us 82.051us 22.560us 34.78% 26.687us 4.448us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.095us 18.65% 12.095us 4.032us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.465us 16.13% 10.465us 3.488us 3 + Activity Buffer Request 31.75% 255.134us 31.75% 255.134us 255.134us 4.127us 6.36% 4.127us 4.127us 1 + aten::empty_strided 3.56% 28.641us 3.56% 28.641us 4.773us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.49% 204.843us 25.49% 204.843us 22.760us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.06% 16.521us 2.65% 21.322us 2.369us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.02% 8.171us 1.02% 8.171us 0.545us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.14% 9.170us 1.14% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.00% 8.061us 1.00% 8.061us 2.687us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.66% 5.330us 0.81% 6.520us 2.173us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 805.451us -Self CUDA time total: 64.160us +Self CPU time total: 803.533us +Self CUDA time total: 64.864us @@ -4825,29 +4607,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.218us 487.48% 340.218us 340.218us 1 - torch_eager 15.18% 124.853us 99.38% 817.682us 817.682us 0.000us 0.00% 73.887us 73.887us 1 - aten::conv1d 0.72% 5.910us 14.57% 119.903us 39.968us 0.000us 0.00% 47.328us 15.776us 3 - aten::convolution 1.21% 9.960us 13.86% 113.993us 37.998us 0.000us 0.00% 47.328us 15.776us 3 - aten::_convolution 2.81% 23.101us 12.64% 104.033us 34.678us 0.000us 0.00% 47.328us 15.776us 3 - aten::_conv_depthwise2d 2.62% 21.561us 7.83% 64.432us 21.477us 47.328us 67.81% 47.328us 15.776us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.328us 67.81% 47.328us 15.776us 3 - aten::to 0.75% 6.180us 66.30% 545.475us 90.913us 0.000us 0.00% 26.559us 4.426us 6 - aten::_to_copy 2.97% 24.459us 65.55% 539.295us 89.882us 0.000us 0.00% 26.559us 4.426us 6 - aten::copy_ 6.14% 50.491us 58.93% 484.862us 80.810us 22.463us 32.19% 26.559us 4.426us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.032us 17.24% 12.032us 4.011us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 14.95% 10.431us 3.477us 3 - Activity Buffer Request 30.21% 248.576us 30.21% 248.576us 248.576us 4.096us 5.87% 4.096us 4.096us 1 - aten::empty_strided 3.64% 29.974us 3.64% 29.974us 4.996us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.32% 208.345us 25.32% 208.345us 23.149us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.09% 17.201us 2.72% 22.401us 2.489us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.11% 9.120us 1.11% 9.120us 0.608us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.32% 10.899us 1.32% 10.899us 3.633us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.15% 9.422us 1.15% 9.422us 3.141us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.80% 6.580us 0.98% 8.070us 2.690us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.383us 466.25% 328.383us 328.383us 1 + torch_eager 5.82% 138.672us 99.78% 2.376ms 2.376ms 0.000us 0.00% 74.527us 74.527us 1 + aten::conv1d 0.24% 5.689us 4.87% 115.970us 38.657us 0.000us 0.00% 47.969us 15.990us 3 + aten::convolution 0.43% 10.191us 4.63% 110.281us 36.760us 0.000us 0.00% 47.969us 15.990us 3 + aten::_convolution 0.91% 21.579us 4.20% 100.090us 33.363us 0.000us 0.00% 47.969us 15.990us 3 + aten::_conv_depthwise2d 0.87% 20.670us 2.63% 62.670us 20.890us 47.969us 68.11% 47.969us 15.990us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.969us 68.11% 47.969us 15.990us 3 + aten::to 0.27% 6.430us 88.04% 2.097ms 349.464us 0.000us 0.00% 26.558us 4.426us 6 + aten::_to_copy 0.99% 23.642us 87.77% 2.090ms 348.392us 0.000us 0.00% 26.558us 4.426us 6 + aten::copy_ 2.06% 49.120us 85.54% 2.037ms 339.525us 22.462us 31.89% 26.558us 4.426us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.999us 17.04% 11.999us 4.000us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.463us 14.86% 10.463us 3.488us 3 + Activity Buffer Request 75.66% 1.802ms 75.66% 1.802ms 1.802ms 4.096us 5.82% 4.096us 4.096us 1 + aten::empty_strided 1.24% 29.560us 1.24% 29.560us 4.927us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.75% 208.373us 8.75% 208.373us 23.153us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.70% 16.782us 0.92% 21.972us 2.441us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.36% 8.520us 0.36% 8.520us 0.568us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.38% 9.160us 0.38% 9.160us 3.053us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 10.580us 0.44% 10.580us 3.527us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.24% 5.730us 0.29% 7.020us 2.340us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 822.752us -Self CUDA time total: 69.791us +Self CPU time total: 2.382ms +Self CUDA time total: 70.431us @@ -4857,29 +4639,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.276us 192.10% 357.276us 357.276us 1 - torch_eager 7.25% 148.445us 99.75% 2.043ms 2.043ms 0.000us 0.00% 196.063us 196.063us 1 - aten::conv1d 0.28% 5.714us 6.04% 123.725us 41.242us 0.000us 0.00% 133.535us 44.512us 3 - aten::convolution 0.50% 10.209us 5.76% 118.011us 39.337us 0.000us 0.00% 133.535us 44.512us 3 - aten::_convolution 1.22% 24.922us 5.26% 107.802us 35.934us 0.000us 0.00% 133.535us 44.512us 3 - aten::_conv_depthwise2d 1.06% 21.740us 3.25% 66.540us 22.180us 133.535us 71.80% 133.535us 44.512us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.535us 71.80% 133.535us 44.512us 3 - aten::to 0.32% 6.558us 85.01% 1.741ms 290.215us 0.000us 0.00% 62.528us 10.421us 6 - aten::_to_copy 1.28% 26.242us 84.69% 1.735ms 289.122us 0.000us 0.00% 62.528us 10.421us 6 - aten::copy_ 2.37% 48.539us 81.91% 1.678ms 279.634us 52.448us 28.20% 62.528us 10.421us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.536us 15.88% 29.536us 9.845us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 12.32% 22.912us 7.637us 3 - Activity Buffer Request 70.45% 1.443ms 70.45% 1.443ms 1.443ms 10.080us 5.42% 10.080us 10.080us 1 - aten::empty_strided 1.50% 30.691us 1.50% 30.691us 5.115us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.22% 209.265us 10.22% 209.265us 23.252us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.93% 19.072us 1.20% 24.640us 2.738us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 9.247us 0.45% 9.247us 0.616us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.55% 11.270us 0.55% 11.270us 3.757us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.51% 10.520us 0.51% 10.520us 3.507us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.29% 5.931us 0.35% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.351us 179.68% 336.351us 336.351us 1 + torch_eager 5.85% 142.571us 99.79% 2.430ms 2.430ms 0.000us 0.00% 197.311us 197.311us 1 + aten::conv1d 0.28% 6.741us 4.71% 114.731us 38.244us 0.000us 0.00% 134.368us 44.789us 3 + aten::convolution 0.38% 9.350us 4.43% 107.990us 35.997us 0.000us 0.00% 134.368us 44.789us 3 + aten::_convolution 0.88% 21.488us 4.05% 98.640us 32.880us 0.000us 0.00% 134.368us 44.789us 3 + aten::_conv_depthwise2d 0.83% 20.301us 2.51% 61.091us 20.364us 134.368us 71.78% 134.368us 44.789us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 134.368us 71.78% 134.368us 44.789us 3 + aten::to 0.26% 6.379us 88.22% 2.148ms 358.072us 0.000us 0.00% 62.943us 10.491us 6 + aten::_to_copy 0.93% 22.632us 87.96% 2.142ms 357.009us 0.000us 0.00% 62.943us 10.491us 6 + aten::copy_ 2.03% 49.489us 85.76% 2.089ms 348.110us 52.831us 28.22% 62.943us 10.491us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.727us 15.88% 29.727us 9.909us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.104us 12.34% 23.104us 7.701us 3 + Activity Buffer Request 76.11% 1.853ms 76.11% 1.853ms 1.853ms 10.112us 5.40% 10.112us 10.112us 1 + aten::empty_strided 1.26% 30.760us 1.26% 30.760us 5.127us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 8.55% 208.274us 8.55% 208.274us 23.142us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.71% 17.184us 0.91% 22.223us 2.469us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.34% 8.338us 0.34% 8.338us 0.556us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.38% 9.180us 0.38% 9.180us 3.060us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.37% 9.020us 0.37% 9.020us 3.007us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.22% 5.460us 0.27% 6.690us 2.230us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.048ms -Self CUDA time total: 185.983us +Self CPU time total: 2.435ms +Self CUDA time total: 187.199us @@ -4889,29 +4671,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.235us 170.21% 358.235us 358.235us 1 - torch_eager 15.50% 124.275us 99.34% 796.461us 796.461us 0.000us 0.00% 224.253us 224.253us 1 - aten::conv1d 0.70% 5.590us 14.78% 118.483us 39.494us 0.000us 0.00% 154.174us 51.391us 3 - aten::convolution 1.24% 9.921us 14.08% 112.893us 37.631us 0.000us 0.00% 154.174us 51.391us 3 - aten::_convolution 2.81% 22.549us 12.84% 102.972us 34.324us 0.000us 0.00% 154.174us 51.391us 3 - aten::_conv_depthwise2d 2.82% 22.632us 8.11% 65.062us 21.687us 154.174us 73.26% 154.174us 51.391us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.174us 73.26% 154.174us 51.391us 3 - aten::to 0.74% 5.971us 65.46% 524.833us 87.472us 0.000us 0.00% 70.079us 11.680us 6 - aten::_to_copy 3.23% 25.880us 64.72% 518.862us 86.477us 0.000us 0.00% 70.079us 11.680us 6 - aten::copy_ 6.33% 50.713us 57.67% 462.401us 77.067us 56.287us 26.74% 70.079us 11.680us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 33.248us 15.80% 33.248us 11.083us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.039us 10.95% 23.039us 7.680us 3 - Activity Buffer Request 28.19% 225.995us 28.19% 225.995us 225.995us 13.792us 6.55% 13.792us 13.792us 1 - aten::empty_strided 3.81% 30.581us 3.81% 30.581us 5.097us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.98% 208.263us 25.98% 208.263us 23.140us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.24% 17.992us 2.91% 23.301us 2.589us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.16% 9.309us 1.16% 9.309us 0.621us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.31% 10.480us 1.31% 10.480us 3.493us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.17% 9.380us 1.17% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.74% 5.910us 0.92% 7.370us 2.457us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.323us 159.21% 335.323us 335.323us 1 + torch_eager 14.44% 115.471us 99.40% 794.842us 794.842us 0.000us 0.00% 223.709us 223.709us 1 + aten::conv1d 0.70% 5.561us 13.80% 110.362us 36.787us 0.000us 0.00% 154.845us 51.615us 3 + aten::convolution 1.15% 9.189us 13.11% 104.801us 34.934us 0.000us 0.00% 154.845us 51.615us 3 + aten::_convolution 2.52% 20.182us 11.96% 95.612us 31.871us 0.000us 0.00% 154.845us 51.615us 3 + aten::_conv_depthwise2d 2.51% 20.101us 7.60% 60.741us 20.247us 154.845us 73.52% 154.845us 51.615us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.845us 73.52% 154.845us 51.615us 3 + aten::to 0.72% 5.750us 68.18% 545.179us 90.863us 0.000us 0.00% 68.864us 11.477us 6 + aten::_to_copy 2.77% 22.130us 67.46% 539.429us 89.905us 0.000us 0.00% 68.864us 11.477us 6 + aten::copy_ 5.86% 46.830us 60.79% 486.078us 81.013us 55.776us 26.48% 68.864us 11.477us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.416us 15.39% 32.416us 10.805us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.360us 11.09% 23.360us 7.787us 3 + Activity Buffer Request 31.66% 253.204us 31.66% 253.204us 253.204us 13.088us 6.21% 13.088us 13.088us 1 + aten::empty_strided 3.90% 31.221us 3.90% 31.221us 5.203us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 26.02% 208.054us 26.02% 208.054us 23.117us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.93% 15.399us 2.47% 19.760us 2.196us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.98% 7.800us 0.98% 7.800us 0.520us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.23% 9.810us 1.23% 9.810us 3.270us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.10% 8.820us 1.10% 8.820us 2.940us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.69% 5.519us 0.86% 6.899us 2.300us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 801.751us -Self CUDA time total: 210.461us +Self CPU time total: 799.662us +Self CUDA time total: 210.621us @@ -4921,29 +4703,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 7.15% 131.473us 52.77% 970.085us 970.085us 0.000us 0.00% 1.521ms 1.521ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.40% 1.421ms 1.421ms 1 - aten::to 0.36% 6.571us 37.17% 683.219us 113.870us 0.000us 0.00% 824.180us 137.363us 6 - aten::_to_copy 1.61% 29.612us 36.81% 676.648us 112.775us 0.000us 0.00% 824.180us 137.363us 6 - aten::copy_ 2.81% 51.569us 25.14% 462.051us 77.009us 718.613us 50.76% 824.180us 137.363us 6 - aten::conv1d 0.36% 6.680us 6.82% 125.423us 41.808us 0.000us 0.00% 696.981us 232.327us 3 - aten::convolution 0.57% 10.460us 6.46% 118.743us 39.581us 0.000us 0.00% 696.981us 232.327us 3 - aten::_convolution 1.31% 24.040us 5.89% 108.283us 36.094us 0.000us 0.00% 696.981us 232.327us 3 - aten::_conv_depthwise2d 1.25% 22.981us 3.69% 67.913us 22.638us 696.981us 49.24% 696.981us 232.327us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.981us 49.24% 696.981us 232.327us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 410.458us 29.00% 410.458us 136.819us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 308.155us 21.77% 308.155us 102.718us 3 - Activity Buffer Request 11.91% 218.936us 11.91% 218.936us 218.936us 105.567us 7.46% 105.567us 105.567us 1 - aten::empty_strided 2.01% 37.011us 10.06% 184.985us 30.831us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 11.74% 215.777us 11.74% 215.777us 23.975us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.99% 18.200us 1.31% 24.000us 2.667us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.53% 9.740us 0.53% 9.740us 0.649us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.59% 10.839us 0.59% 10.839us 3.613us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.54% 9.862us 0.54% 9.862us 3.287us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.34% 6.240us 0.42% 7.700us 2.567us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 6.62% 120.362us 52.56% 956.135us 956.135us 0.000us 0.00% 1.509ms 1.509ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.411ms 100.41% 1.411ms 1.411ms 1 + aten::to 0.34% 6.140us 38.13% 693.750us 115.625us 0.000us 0.00% 815.515us 135.919us 6 + aten::_to_copy 1.53% 27.810us 37.80% 687.610us 114.602us 0.000us 0.00% 815.515us 135.919us 6 + aten::copy_ 2.83% 51.570us 25.68% 467.247us 77.874us 711.740us 50.66% 815.515us 135.919us 6 + aten::conv1d 0.32% 5.781us 6.36% 115.702us 38.567us 0.000us 0.00% 693.278us 231.093us 3 + aten::convolution 0.51% 9.289us 6.04% 109.921us 36.640us 0.000us 0.00% 693.278us 231.093us 3 + aten::_convolution 1.19% 21.630us 5.53% 100.632us 33.544us 0.000us 0.00% 693.278us 231.093us 3 + aten::_conv_depthwise2d 1.16% 21.108us 3.52% 63.951us 21.317us 693.278us 49.34% 693.278us 231.093us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 693.278us 49.34% 693.278us 231.093us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 405.439us 28.86% 405.439us 135.146us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 306.301us 21.80% 306.301us 102.100us 3 + Activity Buffer Request 12.14% 220.924us 12.14% 220.924us 220.924us 103.775us 7.39% 103.775us 103.775us 1 + aten::empty_strided 1.98% 36.051us 10.58% 192.553us 32.092us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 12.05% 219.204us 12.05% 219.204us 24.356us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.93% 16.940us 1.22% 22.200us 2.467us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.48% 8.651us 0.48% 8.651us 0.577us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.51% 9.201us 0.51% 9.201us 3.067us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.51% 9.191us 0.51% 9.191us 3.064us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 5.621us 0.38% 6.871us 2.290us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.838ms -Self CUDA time total: 1.416ms +Self CPU time total: 1.819ms +Self CUDA time total: 1.405ms @@ -4953,56 +4735,56 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 6.74% 124.615us 43.66% 806.720us 806.720us 0.000us 0.00% 1.502ms 1.502ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.433ms 100.41% 1.433ms 1.433ms 1 - aten::to 0.34% 6.269us 28.35% 523.751us 87.292us 0.000us 0.00% 764.786us 127.464us 6 - aten::_to_copy 1.27% 23.480us 28.01% 517.482us 86.247us 0.000us 0.00% 764.786us 127.464us 6 - aten::copy_ 2.74% 50.661us 25.15% 464.712us 77.452us 690.099us 48.36% 764.786us 127.464us 6 - aten::conv1d 0.32% 5.870us 7.00% 129.374us 43.125us 0.000us 0.00% 737.040us 245.680us 3 - aten::convolution 0.54% 9.999us 6.68% 123.504us 41.168us 0.000us 0.00% 737.040us 245.680us 3 - aten::_convolution 1.31% 24.293us 6.14% 113.505us 37.835us 0.000us 0.00% 737.040us 245.680us 3 - aten::_conv_depthwise2d 1.62% 30.010us 3.95% 73.060us 24.353us 737.040us 51.64% 737.040us 245.680us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.040us 51.64% 737.040us 245.680us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 399.673us 28.01% 399.673us 133.224us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 290.426us 20.35% 290.426us 96.809us 3 - Activity Buffer Request 12.15% 224.466us 12.15% 224.466us 224.466us 74.687us 5.23% 74.687us 74.687us 1 - aten::empty_strided 1.59% 29.290us 1.59% 29.290us 4.882us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 11.52% 212.785us 11.52% 212.785us 23.643us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.94% 17.281us 1.23% 22.771us 2.530us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.55% 10.081us 0.55% 10.081us 0.672us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.57% 10.440us 0.57% 10.440us 3.480us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.51% 9.410us 0.51% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.33% 6.150us 0.41% 7.641us 2.547us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 6.07% 112.213us 42.26% 781.792us 781.792us 0.000us 0.00% 1.498ms 1.498ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.428ms 100.39% 1.428ms 1.428ms 1 + aten::to 0.33% 6.130us 28.74% 531.749us 88.625us 0.000us 0.00% 757.569us 126.261us 6 + aten::_to_copy 1.23% 22.780us 28.41% 525.619us 87.603us 0.000us 0.00% 757.569us 126.261us 6 + aten::copy_ 2.64% 48.852us 25.56% 472.969us 78.828us 682.049us 47.95% 757.569us 126.261us 6 + aten::conv1d 0.33% 6.130us 6.13% 113.361us 37.787us 0.000us 0.00% 740.449us 246.816us 3 + aten::convolution 0.48% 8.889us 5.80% 107.231us 35.744us 0.000us 0.00% 740.449us 246.816us 3 + aten::_convolution 1.13% 20.931us 5.32% 98.342us 32.781us 0.000us 0.00% 740.449us 246.816us 3 + aten::_conv_depthwise2d 1.15% 21.330us 3.38% 62.491us 20.830us 740.449us 52.05% 740.449us 246.816us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 740.449us 52.05% 740.449us 246.816us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 397.857us 27.97% 397.857us 132.619us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 284.192us 19.98% 284.192us 94.731us 3 + Activity Buffer Request 12.95% 239.644us 12.95% 239.644us 239.644us 75.520us 5.31% 75.520us 75.520us 1 + aten::empty_strided 1.61% 29.870us 1.61% 29.870us 4.978us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.17% 206.574us 11.17% 206.574us 22.953us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.85% 15.779us 1.12% 20.809us 2.312us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.45% 8.409us 0.45% 8.409us 0.561us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.49% 9.120us 0.49% 9.120us 3.040us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.54% 9.940us 0.54% 9.940us 3.313us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.29% 5.381us 0.36% 6.700us 2.233us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.848ms -Self CUDA time total: 1.427ms +Self CPU time total: 1.850ms +Self CUDA time total: 1.422ms impl wl p50(ms) ok -torch_eager cuda_B2_D2048_S128_W2 0.09 True +torch_eager cuda_B2_D2048_S128_W2 0.08 True torch_eager cuda_B2_D2048_S128_W4 0.08 True -torch_eager cuda_B2_D2048_S2048_W2 0.15 True +torch_eager cuda_B2_D2048_S2048_W2 0.16 True torch_eager cuda_B2_D2048_S2048_W4 0.16 True -torch_eager cuda_B2_D2048_S512_W2 0.09 True -torch_eager cuda_B2_D2048_S512_W4 0.09 True +torch_eager cuda_B2_D2048_S512_W2 0.08 True +torch_eager cuda_B2_D2048_S512_W4 0.08 True torch_eager cuda_B2_D64_S128_W2 0.07 True -torch_eager cuda_B2_D64_S128_W4 0.09 True -torch_eager cuda_B2_D64_S2048_W2 0.09 True -torch_eager cuda_B2_D64_S2048_W4 0.09 True -torch_eager cuda_B2_D64_S512_W2 0.09 True -torch_eager cuda_B2_D64_S512_W4 0.09 True -torch_eager cuda_B4_D2048_S128_W2 0.09 True -torch_eager cuda_B4_D2048_S128_W4 0.09 True -torch_eager cuda_B4_D2048_S2048_W2 0.49 True +torch_eager cuda_B2_D64_S128_W4 0.08 True +torch_eager cuda_B2_D64_S2048_W2 0.08 True +torch_eager cuda_B2_D64_S2048_W4 0.08 True +torch_eager cuda_B2_D64_S512_W2 0.08 True +torch_eager cuda_B2_D64_S512_W4 0.08 True +torch_eager cuda_B4_D2048_S128_W2 0.08 True +torch_eager cuda_B4_D2048_S128_W4 0.08 True +torch_eager cuda_B4_D2048_S2048_W2 0.48 True torch_eager cuda_B4_D2048_S2048_W4 0.50 True -torch_eager cuda_B4_D2048_S512_W2 0.10 True +torch_eager cuda_B4_D2048_S512_W2 0.09 True torch_eager cuda_B4_D2048_S512_W4 0.10 True -torch_eager cuda_B4_D64_S128_W2 0.09 True +torch_eager cuda_B4_D64_S128_W2 0.08 True torch_eager cuda_B4_D64_S128_W4 0.08 True -torch_eager cuda_B4_D64_S2048_W2 0.09 True -torch_eager cuda_B4_D64_S2048_W4 0.09 True -torch_eager cuda_B4_D64_S512_W2 0.09 True -torch_eager cuda_B4_D64_S512_W4 0.09 True +torch_eager cuda_B4_D64_S2048_W2 0.08 True +torch_eager cuda_B4_D64_S2048_W4 0.08 True +torch_eager cuda_B4_D64_S512_W2 0.08 True +torch_eager cuda_B4_D64_S512_W4 0.08 TrueArtifacts: