joeddav committed on
Commit
1f77aa7
·
1 Parent(s): 83ce100

Publish WIP HF Space snapshot

Browse files
.dockerignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ node_modules
2
+ dist
3
+ .git
4
+ npm-debug.log
.gitignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+ test-results
15
+ playwright-report
16
+ tests/topology.spec.ts-snapshots
17
+
18
+ # Editor directories and files
19
+ .vscode/*
20
+ !.vscode/extensions.json
21
+ .idea
22
+ .DS_Store
23
+ *.suo
24
+ *.ntvs*
25
+ *.njsproj
26
+ *.sln
27
+ *.sw?
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM node:22-alpine AS base
2
+ WORKDIR /app
3
+ COPY package*.json ./
4
+ RUN npm ci
5
+
6
+ FROM base AS dev
7
+ COPY . .
8
+ EXPOSE 7860
9
+ CMD ["npm", "run", "dev"]
10
+
11
+ FROM base AS build
12
+ COPY . .
13
+ RUN npm run build
14
+
15
+ FROM node:22-alpine AS production
16
+ RUN npm install -g serve@14.2.4
17
+ USER node
18
+ ENV HOME=/home/node
19
+ WORKDIR /home/node/app
20
+ COPY --from=build --chown=node:node /app/dist ./dist
21
+ EXPOSE 7860
22
+ CMD ["serve", "-s", "dist", "-l", "7860"]
README.md CHANGED
@@ -1,12 +1,59 @@
1
  ---
2
- title: Illustrated Cluster
3
- emoji:
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: docker
 
7
  pinned: false
8
  license: mit
9
- short_description: '[WIP] Interactive visualization of an LLM training cluster'
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: "[WIP] Illustrated Training Cluster"
3
+ colorFrom: yellow
4
+ colorTo: green
 
5
  sdk: docker
6
+ app_port: 7860
7
  pinned: false
8
  license: mit
9
+ short_description: "[WIP] Interactive visualization of an LLM training cluster"
10
  ---
11
 
12
+ # [WIP] Illustrated Training Cluster
13
+
14
+ Interactive workbench for exploring how large-model training layouts map onto GPU clusters.
15
+
16
+ Current WIP scope:
17
+
18
+ - compute-backed memory, communication, and throughput estimates
19
+ - linked cluster and transformer visualizations
20
+ - editable model, cluster, training, and parallelism controls
21
+ - built-in OLMo 3 32B and Trinity Large 400B starting points
22
+
23
+ Temporary note:
24
+
25
+ - the Llama 3.1 405B example is hidden from the UI while its training recipe is being reworked
26
+
27
+ ## Stack
28
+
29
+ - React 19 + TypeScript
30
+ - Vite
31
+ - PixiJS + `@pixi/react`
32
+ - Docker for local runs and Hugging Face Spaces deployment
33
+
34
+ ## Local development
35
+
36
+ ```bash
37
+ docker compose up --build
38
+ ```
39
+
40
+ Then open [http://localhost:7860](http://localhost:7860).
41
+
42
+ ## Checks
43
+
44
+ ```bash
45
+ npm run test:unit
46
+ npm run lint
47
+ npm run build
48
+ npm run test:e2e
49
+ ```
50
+
51
+ ## Debugging and snapshots
52
+
53
+ - `?debug=1` enables the in-app debug overlay
54
+ - `?snapshot=1` freezes animation for deterministic screenshots
55
+ - `?scenario=default|olmo-pretraining|olmo-long-context|llama-pretraining|llama-long-context|trinity-pretraining|trinity-long-context|infeasible-memory`
56
+
57
+ ## Hugging Face Spaces
58
+
59
+ This repository is configured as a Docker Space. Hugging Face builds the root `Dockerfile` and serves the app on port `7860`.
compose.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ app:
3
+ build:
4
+ context: .
5
+ target: dev
6
+ ports:
7
+ - '7860:7860'
8
+ environment:
9
+ CHOKIDAR_USEPOLLING: 'true'
10
+ volumes:
11
+ - .:/app
12
+ - /app/node_modules
eslint.config.js ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import js from '@eslint/js'
2
+ import globals from 'globals'
3
+ import reactHooks from 'eslint-plugin-react-hooks'
4
+ import reactRefresh from 'eslint-plugin-react-refresh'
5
+ import tseslint from 'typescript-eslint'
6
+ import { defineConfig, globalIgnores } from 'eslint/config'
7
+
8
/**
 * Rule set applied to every TypeScript / TSX source file: the base JS and
 * typescript-eslint recommendations plus the React hooks and React Refresh
 * (Vite) presets, evaluated against browser globals.
 */
const typescriptSources = {
  files: ['**/*.{ts,tsx}'],
  extends: [
    js.configs.recommended,
    tseslint.configs.recommended,
    reactHooks.configs.flat.recommended,
    reactRefresh.configs.vite,
  ],
  languageOptions: {
    ecmaVersion: 2020,
    globals: globals.browser,
  },
}

// Build output in `dist` is never linted.
export default defineConfig([globalIgnores(['dist']), typescriptSources])
index.html ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <link rel="icon" type="image/svg+xml" href="/vite.svg" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>[WIP] Illustrated Training Cluster</title>
8
+ </head>
9
+ <body>
10
+ <div id="root"></div>
11
+ <script type="module" src="/src/main.tsx"></script>
12
+ </body>
13
+ </html>
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "cluster-topology-viz",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "tsc -b && vite build",
9
+ "lint": "eslint .",
10
+ "preview": "vite preview",
11
+ "check": "npm run lint && npm run build",
12
+ "test:unit": "vitest run",
13
+ "test:e2e": "npm run build && playwright test",
14
+ "test:e2e:update": "npm run build && playwright test --update-snapshots"
15
+ },
16
+ "dependencies": {
17
+ "@fontsource/ibm-plex-mono": "^5.2.7",
18
+ "@fontsource/space-grotesk": "^5.2.10",
19
+ "@pixi/react": "^8.0.5",
20
+ "pixi.js": "^8.16.0",
21
+ "react": "^19.2.0",
22
+ "react-dom": "^19.2.0"
23
+ },
24
+ "devDependencies": {
25
+ "@eslint/js": "^9.39.1",
26
+ "@playwright/test": "^1.58.2",
27
+ "@types/node": "^24.10.1",
28
+ "@types/react": "^19.2.7",
29
+ "@types/react-dom": "^19.2.3",
30
+ "@vitejs/plugin-react": "^5.1.1",
31
+ "eslint": "^9.39.1",
32
+ "eslint-plugin-react-hooks": "^7.0.1",
33
+ "eslint-plugin-react-refresh": "^0.4.24",
34
+ "globals": "^16.5.0",
35
+ "typescript": "~5.9.3",
36
+ "typescript-eslint": "^8.48.0",
37
+ "vite": "^7.3.1",
38
+ "vitest": "^4.0.18"
39
+ }
40
+ }
playwright.config.ts ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { defineConfig, devices } from '@playwright/test'
2
+
3
// Single source of truth for the viewport used by both the shared `use`
// options and the chromium project (which would otherwise inherit the
// viewport bundled with the 'Desktop Chrome' device descriptor).
const VIEWPORT = { width: 1600, height: 1100 }

/**
 * End-to-end test configuration.
 *
 * Runs the specs in `./tests` serially in a single worker against the
 * `vite preview` server on 127.0.0.1:4173.
 */
export default defineConfig({
  testDir: './tests',
  testMatch: /.*\.spec\.ts/,
  fullyParallel: false,
  retries: 0,
  reporter: 'list',
  workers: 1,
  use: {
    baseURL: 'http://127.0.0.1:4173',
    // 'on-first-retry' only records while a test is being retried; with
    // `retries: 0` that never happens, so no trace was ever produced.
    // 'retain-on-failure' keeps a trace for every failing test instead.
    trace: 'retain-on-failure',
    viewport: VIEWPORT,
  },
  projects: [
    {
      name: 'chromium',
      use: {
        ...devices['Desktop Chrome'],
        viewport: VIEWPORT,
      },
    },
  ],
  webServer: {
    command: 'npm run preview -- --host 127.0.0.1 --port 4173',
    port: 4173,
    reuseExistingServer: true,
    timeout: 120000,
  },
})
src/App.css ADDED
@@ -0,0 +1,641 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .workbench-shell {
2
+ max-width: 1680px;
3
+ margin: 0 auto;
4
+ padding: 18px;
5
+ }
6
+
7
+ .mini-label {
8
+ margin: 0 0 6px;
9
+ color: var(--accent-cool);
10
+ font-family: var(--font-mono);
11
+ font-size: 0.72rem;
12
+ letter-spacing: 0.13em;
13
+ text-transform: uppercase;
14
+ }
15
+
16
+ .app-topbar {
17
+ display: grid;
18
+ gap: 14px;
19
+ margin-bottom: 14px;
20
+ }
21
+
22
+ .title-block {
23
+ display: grid;
24
+ gap: 4px;
25
+ }
26
+
27
+ .title-block h1 {
28
+ margin: 0;
29
+ color: var(--ink-strong);
30
+ font-size: clamp(1.7rem, 2vw, 2.2rem);
31
+ line-height: 1;
32
+ letter-spacing: -0.04em;
33
+ }
34
+
35
+ .title-copy {
36
+ margin: 0;
37
+ max-width: 88ch;
38
+ color: var(--ink-soft);
39
+ font-size: 0.98rem;
40
+ }
41
+
42
+ .summary-strip {
43
+ display: grid;
44
+ grid-template-columns: minmax(260px, 1.8fr) repeat(4, minmax(0, 1fr));
45
+ gap: 10px;
46
+ }
47
+
48
+ .summary-card,
49
+ .controls-band,
50
+ .map-panel,
51
+ .side-card {
52
+ border: 1px solid var(--panel-stroke);
53
+ background: rgba(253, 252, 248, 0.92);
54
+ box-shadow: 0 12px 28px rgba(19, 42, 51, 0.06);
55
+ }
56
+
57
+ .summary-card {
58
+ min-height: 88px;
59
+ padding: 12px 14px;
60
+ border-radius: 16px;
61
+ }
62
+
63
+ .summary-card span,
64
+ .fact-row span,
65
+ .inspector-grid dt {
66
+ display: block;
67
+ color: var(--ink-muted);
68
+ font-size: 0.76rem;
69
+ text-transform: uppercase;
70
+ letter-spacing: 0.08em;
71
+ }
72
+
73
+ .summary-card strong,
74
+ .fact-row strong,
75
+ .inspector-grid dd {
76
+ display: block;
77
+ margin-top: 6px;
78
+ color: var(--ink-strong);
79
+ font-size: 1.15rem;
80
+ line-height: 1.05;
81
+ }
82
+
83
+ .summary-card p {
84
+ margin: 8px 0 0;
85
+ color: var(--ink-soft);
86
+ font-size: 0.9rem;
87
+ }
88
+
89
+ .summary-card-wide strong {
90
+ font-size: 1.25rem;
91
+ }
92
+
93
+ .controls-band {
94
+ padding: 12px 14px 14px;
95
+ border-radius: 16px;
96
+ }
97
+
98
+ .controls-head {
99
+ display: flex;
100
+ justify-content: space-between;
101
+ gap: 12px;
102
+ align-items: flex-end;
103
+ margin-bottom: 12px;
104
+ }
105
+
106
+ .controls-head h2,
107
+ .topology-header h2,
108
+ .side-header h3 {
109
+ margin: 0;
110
+ color: var(--ink-strong);
111
+ font-size: 1.1rem;
112
+ }
113
+
114
+ .controls-meta {
115
+ display: flex;
116
+ flex-wrap: wrap;
117
+ gap: 8px;
118
+ align-items: center;
119
+ }
120
+
121
+ .controls-meta span,
122
+ .reset-chip,
123
+ .scene-button {
124
+ padding: 7px 10px;
125
+ border-radius: 999px;
126
+ border: 1px solid rgba(19, 58, 80, 0.09);
127
+ background: rgba(246, 244, 238, 0.92);
128
+ color: var(--ink-soft);
129
+ font-family: var(--font-mono);
130
+ font-size: 0.78rem;
131
+ }
132
+
133
+ .reset-chip,
134
+ .scene-button {
135
+ color: var(--accent-warm);
136
+ }
137
+
138
+ .controls-grid {
139
+ display: grid;
140
+ grid-template-columns: repeat(4, minmax(0, 1fr));
141
+ gap: 10px;
142
+ }
143
+
144
+ .controls-stack {
145
+ display: grid;
146
+ gap: 10px;
147
+ }
148
+
149
+ .controls-grid-parallelism {
150
+ grid-template-columns: repeat(5, minmax(0, 1fr));
151
+ }
152
+
153
+ .control-card {
154
+ border: 1px solid rgba(19, 58, 80, 0.08);
155
+ border-radius: 14px;
156
+ padding: 11px 12px;
157
+ background: rgba(250, 248, 242, 0.96);
158
+ }
159
+
160
+ .field-grid {
161
+ display: grid;
162
+ grid-template-columns: repeat(2, minmax(0, 1fr));
163
+ gap: 10px;
164
+ }
165
+
166
+ .field-grid-wide {
167
+ grid-template-columns: repeat(3, minmax(0, 1fr));
168
+ }
169
+
170
+ .control-card-header {
171
+ display: flex;
172
+ justify-content: space-between;
173
+ gap: 10px;
174
+ align-items: center;
175
+ margin-bottom: 10px;
176
+ }
177
+
178
+ .control-card-header h3 {
179
+ margin: 0;
180
+ color: var(--ink-strong);
181
+ font-size: 1rem;
182
+ }
183
+
184
+ .control-card-header p {
185
+ margin: 2px 0 0;
186
+ color: var(--ink-soft);
187
+ font-size: 0.84rem;
188
+ }
189
+
190
+ .control-badge {
191
+ padding: 5px 8px;
192
+ border-radius: 999px;
193
+ background: rgba(17, 122, 112, 0.1);
194
+ color: var(--accent-cool);
195
+ font-family: var(--font-mono);
196
+ font-size: 0.76rem;
197
+ }
198
+
199
+ .control-field {
200
+ display: grid;
201
+ gap: 6px;
202
+ }
203
+
204
+ .control-field span,
205
+ .control-toggle span {
206
+ color: var(--ink-muted);
207
+ font-size: 0.76rem;
208
+ letter-spacing: 0.06em;
209
+ text-transform: uppercase;
210
+ }
211
+
212
+ .control-field input,
213
+ .control-field select {
214
+ width: 100%;
215
+ padding: 8px 10px;
216
+ border: 1px solid rgba(19, 58, 80, 0.12);
217
+ border-radius: 10px;
218
+ background: #fffdf8;
219
+ color: var(--ink-strong);
220
+ }
221
+
222
+ .control-field-toggle {
223
+ align-self: end;
224
+ }
225
+
226
+ .control-toggle {
227
+ display: inline-flex;
228
+ align-items: center;
229
+ gap: 8px;
230
+ min-height: 40px;
231
+ padding: 8px 10px;
232
+ border: 1px solid rgba(19, 58, 80, 0.12);
233
+ border-radius: 10px;
234
+ background: #fffdf8;
235
+ }
236
+
237
+ .control-toggle input {
238
+ accent-color: var(--accent-cool);
239
+ }
240
+
241
+ .option-strip {
242
+ display: flex;
243
+ flex-wrap: wrap;
244
+ gap: 6px;
245
+ }
246
+
247
+ .option-chip {
248
+ min-width: 38px;
249
+ padding: 7px 9px;
250
+ border: 1px solid rgba(19, 58, 80, 0.12);
251
+ border-radius: 10px;
252
+ background: #fffdf8;
253
+ color: var(--ink-soft);
254
+ font-weight: 500;
255
+ transition:
256
+ background-color 150ms ease,
257
+ border-color 150ms ease,
258
+ transform 150ms ease;
259
+ }
260
+
261
+ .option-chip:hover {
262
+ transform: translateY(-1px);
263
+ }
264
+
265
+ .option-chip.active {
266
+ border-color: rgba(17, 122, 112, 0.26);
267
+ background: rgba(225, 246, 241, 0.96);
268
+ color: var(--accent-cool);
269
+ }
270
+
271
+ .analysis-stack {
272
+ display: grid;
273
+ gap: 14px;
274
+ margin-top: 14px;
275
+ align-items: start;
276
+ }
277
+
278
+ .status-banner {
279
+ display: flex;
280
+ gap: 10px;
281
+ align-items: center;
282
+ padding: 10px 12px;
283
+ border-radius: 14px;
284
+ border: 1px solid rgba(214, 98, 37, 0.16);
285
+ background: rgba(255, 245, 236, 0.96);
286
+ color: var(--ink-soft);
287
+ }
288
+
289
+ .status-banner strong {
290
+ color: var(--accent-warm);
291
+ }
292
+
293
+ .map-panel {
294
+ padding: 12px;
295
+ border-radius: 16px;
296
+ }
297
+
298
+ .topology-header {
299
+ display: flex;
300
+ justify-content: space-between;
301
+ gap: 12px;
302
+ align-items: flex-end;
303
+ margin-bottom: 10px;
304
+ }
305
+
306
+ .topology-header-actions {
307
+ display: flex;
308
+ flex-wrap: wrap;
309
+ gap: 10px;
310
+ align-items: center;
311
+ justify-content: flex-end;
312
+ }
313
+
314
+ .topology-scene-shell {
315
+ display: grid;
316
+ gap: 10px;
317
+ }
318
+
319
+ .scene-toolbar {
320
+ display: flex;
321
+ justify-content: flex-end;
322
+ gap: 10px;
323
+ align-items: center;
324
+ }
325
+
326
+ .scene-toolbar-actions {
327
+ display: flex;
328
+ gap: 8px;
329
+ flex-wrap: wrap;
330
+ }
331
+
332
+ .pixi-surface-wrap {
333
+ position: relative;
334
+ width: 100%;
335
+ overflow: hidden;
336
+ border-radius: 18px;
337
+ background:
338
+ radial-gradient(circle at 10% 10%, rgba(24, 155, 141, 0.14), transparent 22%),
339
+ radial-gradient(circle at 100% 0%, rgba(255, 175, 111, 0.16), transparent 24%),
340
+ linear-gradient(180deg, #0f202d 0%, #08141d 100%);
341
+ }
342
+
343
+ .topology-surface-wrap {
344
+ min-height: 760px;
345
+ height: min(76vh, 980px);
346
+ user-select: none;
347
+ touch-action: none;
348
+ overscroll-behavior: contain;
349
+ }
350
+
351
+ .topology-interaction-layer {
352
+ position: absolute;
353
+ inset: 0;
354
+ z-index: 1;
355
+ background: rgba(0, 0, 0, 0.001);
356
+ cursor: grab;
357
+ touch-action: none;
358
+ overscroll-behavior: contain;
359
+ }
360
+
361
+ .topology-interaction-layer.is-dragging {
362
+ cursor: grabbing;
363
+ }
364
+
365
+ .pixi-surface,
366
+ .pixi-canvas,
367
+ .pixi-surface canvas {
368
+ display: block;
369
+ width: 100%;
370
+ height: 100%;
371
+ }
372
+
373
+ .scene-inspector,
374
+ .scene-debug-panel {
375
+ position: absolute;
376
+ z-index: 2;
377
+ max-width: min(320px, calc(100% - 32px));
378
+ border: 1px solid rgba(255, 255, 255, 0.08);
379
+ border-radius: 16px;
380
+ backdrop-filter: blur(14px);
381
+ pointer-events: auto;
382
+ }
383
+
384
+ .scene-inspector {
385
+ left: 16px;
386
+ bottom: 16px;
387
+ padding: 12px 14px;
388
+ background: rgba(7, 19, 29, 0.78);
389
+ color: rgba(229, 241, 246, 0.92);
390
+ }
391
+
392
+ .scene-inspector .mini-label {
393
+ color: rgba(135, 244, 226, 0.82);
394
+ }
395
+
396
+ .scene-inspector h3 {
397
+ margin: 0;
398
+ font-size: 1rem;
399
+ }
400
+
401
+ .inspector-subheading {
402
+ margin: 6px 0 0;
403
+ color: rgba(179, 201, 211, 0.82);
404
+ font-size: 0.88rem;
405
+ }
406
+
407
+ .inspector-link-note {
408
+ margin: 10px 0 0;
409
+ color: rgba(255, 223, 161, 0.9);
410
+ font-size: 0.82rem;
411
+ }
412
+
413
+ .inspector-grid {
414
+ display: grid;
415
+ grid-template-columns: repeat(2, minmax(0, 1fr));
416
+ gap: 10px 14px;
417
+ margin: 12px 0 0;
418
+ }
419
+
420
+ .inspector-grid div {
421
+ margin: 0;
422
+ }
423
+
424
+ .inspector-grid dt {
425
+ color: rgba(160, 188, 200, 0.78);
426
+ font-size: 0.68rem;
427
+ }
428
+
429
+ .inspector-grid dd {
430
+ margin: 4px 0 0;
431
+ color: rgba(243, 250, 252, 0.96);
432
+ font-size: 0.95rem;
433
+ }
434
+
435
+ .scene-debug-panel {
436
+ top: 16px;
437
+ right: 16px;
438
+ padding: 12px 14px;
439
+ background: rgba(10, 20, 31, 0.82);
440
+ color: rgba(225, 238, 244, 0.92);
441
+ }
442
+
443
+ .scene-debug-panel .mini-label {
444
+ color: rgba(255, 221, 156, 0.82);
445
+ }
446
+
447
+ .debug-toggle-grid {
448
+ display: grid;
449
+ gap: 8px;
450
+ }
451
+
452
+ .debug-toggle-grid label {
453
+ display: flex;
454
+ align-items: center;
455
+ gap: 8px;
456
+ color: rgba(228, 240, 245, 0.92);
457
+ font-size: 0.86rem;
458
+ }
459
+
460
+ .debug-toggle-grid input {
461
+ accent-color: var(--accent-warm);
462
+ }
463
+
464
+ .debug-stats {
465
+ display: flex;
466
+ flex-wrap: wrap;
467
+ gap: 8px;
468
+ margin-top: 12px;
469
+ }
470
+
471
+ .debug-stats span {
472
+ padding: 5px 8px;
473
+ border-radius: 999px;
474
+ background: rgba(255, 255, 255, 0.06);
475
+ color: rgba(235, 245, 248, 0.9);
476
+ font-family: var(--font-mono);
477
+ font-size: 0.74rem;
478
+ }
479
+
480
+ .side-column {
481
+ display: grid;
482
+ gap: 14px;
483
+ }
484
+
485
+ .side-card {
486
+ padding: 12px;
487
+ border-radius: 16px;
488
+ }
489
+
490
+ .side-header {
491
+ margin-bottom: 12px;
492
+ }
493
+
494
+ .facts-grid {
495
+ display: grid;
496
+ gap: 10px;
497
+ }
498
+
499
+ .fact-row {
500
+ padding-bottom: 10px;
501
+ border-bottom: 1px solid rgba(19, 58, 80, 0.08);
502
+ }
503
+
504
+ .fact-row:last-child {
505
+ padding-bottom: 0;
506
+ border-bottom: 0;
507
+ }
508
+
509
+ .warning-list {
510
+ display: grid;
511
+ gap: 8px;
512
+ margin-top: 12px;
513
+ }
514
+
515
+ .warning-pill {
516
+ border-left: 3px solid rgba(214, 98, 37, 0.74);
517
+ border-radius: 10px;
518
+ padding: 9px 10px;
519
+ background: rgba(255, 244, 232, 0.92);
520
+ color: var(--ink-soft);
521
+ font-size: 0.88rem;
522
+ }
523
+
524
+ .fullscreen-overlay {
525
+ position: fixed;
526
+ inset: 0;
527
+ z-index: 40;
528
+ display: grid;
529
+ place-items: center;
530
+ padding: 20px;
531
+ background: rgba(4, 12, 20, 0.72);
532
+ backdrop-filter: blur(10px);
533
+ }
534
+
535
+ .fullscreen-shell {
536
+ display: grid;
537
+ gap: 12px;
538
+ width: min(1600px, 100%);
539
+ max-height: calc(100vh - 40px);
540
+ padding: 14px;
541
+ border: 1px solid rgba(255, 255, 255, 0.08);
542
+ border-radius: 22px;
543
+ background: rgba(252, 250, 245, 0.98);
544
+ box-shadow: 0 24px 80px rgba(4, 12, 20, 0.38);
545
+ }
546
+
547
+ .fullscreen-toolbar {
548
+ display: flex;
549
+ justify-content: space-between;
550
+ gap: 12px;
551
+ align-items: flex-end;
552
+ }
553
+
554
+ .fullscreen-toolbar h2 {
555
+ margin: 0;
556
+ color: var(--ink-strong);
557
+ font-size: 1.2rem;
558
+ }
559
+
560
+ .fullscreen-content {
561
+ min-height: 0;
562
+ overflow: auto;
563
+ }
564
+
565
+ .fullscreen-content .map-panel {
566
+ min-height: calc(100vh - 168px);
567
+ }
568
+
569
+ .fullscreen-content .topology-surface-wrap {
570
+ height: calc(100vh - 290px);
571
+ min-height: 680px;
572
+ }
573
+
574
+ @media (max-width: 1400px) {
575
+ .summary-strip {
576
+ grid-template-columns: repeat(2, minmax(0, 1fr));
577
+ }
578
+
579
+ .fullscreen-shell {
580
+ width: 100%;
581
+ }
582
+ }
583
+
584
+ @media (max-width: 1040px) {
585
+ .controls-grid,
586
+ .controls-grid-parallelism,
587
+ .field-grid,
588
+ .field-grid-wide {
589
+ grid-template-columns: repeat(2, minmax(0, 1fr));
590
+ }
591
+
592
+ .controls-head,
593
+ .topology-header,
594
+ .scene-toolbar,
595
+ .fullscreen-toolbar {
596
+ flex-direction: column;
597
+ align-items: flex-start;
598
+ }
599
+ }
600
+
601
+ @media (max-width: 760px) {
602
+ .workbench-shell {
603
+ padding: 12px;
604
+ }
605
+
606
+ .summary-strip,
607
+ .controls-grid,
608
+ .controls-grid-parallelism,
609
+ .field-grid,
610
+ .field-grid-wide,
611
+ .inspector-grid {
612
+ grid-template-columns: 1fr;
613
+ }
614
+
615
+ .topology-surface-wrap {
616
+ min-height: 560px;
617
+ height: 64vh;
618
+ }
619
+
620
+ .fullscreen-overlay {
621
+ padding: 10px;
622
+ }
623
+
624
+ .fullscreen-shell {
625
+ max-height: calc(100vh - 20px);
626
+ padding: 10px;
627
+ }
628
+
629
/* Narrow screens: shrink the expanded (fullscreen) map surface. The selector
   was previously listed twice in the same selector list, which had no effect. */
.fullscreen-content .topology-surface-wrap {
  min-height: 420px;
  height: 62vh;
}
634
+
635
+ .scene-inspector,
636
+ .scene-debug-panel {
637
+ position: static;
638
+ max-width: none;
639
+ margin: 10px;
640
+ }
641
+ }
src/App.tsx ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useMemo, useState } from 'react'
2
+ import './App.css'
3
+ import { ClusterMap } from './components/ClusterMap'
4
+ import { ControlsPanel } from './components/ControlsPanel'
5
+ import { analyzeCluster } from './lib/trainingClusterModel'
6
+ import { getScenarioConfig, getViewOptions } from './lib/viewOptions'
7
+ import { buildWorkbenchViewModel } from './lib/workbenchPresenter'
8
+ import { type WorkbenchConfig } from './lib/workbench'
9
+
10
/**
 * Root component of the parallelism workbench.
 *
 * Owns the editable workbench configuration, derives the cluster analysis and
 * the presenter view model from it, and renders the summary strip, the
 * controls panel, the cluster map, and the run breakdown. The cluster map can
 * be expanded into a modal overlay that is dismissed with Escape, the close
 * button, or a click on the backdrop.
 */
function App() {
  const viewOptions = getViewOptions()
  const [config, setConfig] = useState<WorkbenchConfig>(() =>
    getScenarioConfig(viewOptions.scenario),
  )
  const [expandedView, setExpandedView] = useState<'cluster' | null>(null)
  const isClusterExpanded = expandedView === 'cluster'

  const analysis = useMemo(
    () => analyzeCluster(config.model, config.training, config.cluster, config.parallelism),
    [config],
  )
  const viewModel = useMemo(
    () => buildWorkbenchViewModel(config, analysis),
    [analysis, config],
  )

  // While the overlay is open: lock page scrolling and close on Escape.
  useEffect(() => {
    if (!expandedView) {
      return undefined
    }

    // Remember the previous inline value so cleanup restores it exactly.
    const previousOverflow = document.body.style.overflow
    document.body.style.overflow = 'hidden'

    const handleKeyDown = (event: KeyboardEvent) => {
      if (event.key === 'Escape') {
        setExpandedView(null)
      }
    }

    window.addEventListener('keydown', handleKeyDown)

    return () => {
      document.body.style.overflow = previousOverflow
      window.removeEventListener('keydown', handleKeyDown)
    }
  }, [expandedView])

  const handleConfigChange = (nextConfig: WorkbenchConfig) => {
    setConfig(nextConfig)
  }

  // Reset returns to the scenario selected by the current view options.
  const handleReset = () => {
    setConfig(getScenarioConfig(viewOptions.scenario))
  }

  // The same panel is rendered either inline or inside the overlay, never both.
  const clusterView = (
    <section className="map-panel">
      <div className="topology-header">
        <div>
          <p className="mini-label">Live cluster topology</p>
          <h2>GPU fabric map</h2>
        </div>

        {/* Inside the overlay the expand button would be a no-op, so it is hidden. */}
        {isClusterExpanded ? null : (
          <div className="topology-header-actions">
            <button
              type="button"
              className="scene-button"
              onClick={() => setExpandedView('cluster')}
            >
              open full screen
            </button>
          </div>
        )}
      </div>

      <ClusterMap
        viewModel={viewModel}
        debugEnabled={viewOptions.debug}
        snapshotMode={viewOptions.snapshot}
        linkedFocus={null}
      />
    </section>
  )

  return (
    <div className="workbench-shell">
      <header className="app-topbar">
        <div className="title-block">
          <p className="mini-label">Illustrated training cluster</p>
          <h1>[WIP] Parallelism workbench</h1>
          <p className="title-copy">{viewModel.subheadline}</p>
        </div>

        {!analysis.feasible ? (
          <div className="status-banner status-banner-danger" data-testid="infeasible-banner">
            <strong>Infeasible configuration</strong>
            <span>{analysis.infeasibilityReason}</span>
          </div>
        ) : null}

        <section className="summary-strip" aria-label="simulation summary">
          <div className="summary-card summary-card-wide">
            <span>Scenario</span>
            <strong>{viewModel.headline}</strong>
            <p>
              {config.cluster.numNodes} {config.cluster.nodeLabel ?? 'nodes'} · {config.cluster.gpuType.name}
              {' · '}
              {config.model.numLayers} layers · hidden {config.model.hiddenDim.toLocaleString()}
            </p>
          </div>
          <div className="summary-card">
            <span>Throughput</span>
            <strong>{viewModel.summary.throughputLabel}</strong>
            <p>{viewModel.summary.throughputNote}</p>
          </div>
          <div className="summary-card">
            <span>Active GPUs</span>
            <strong>{viewModel.summary.gpuLabel}</strong>
            <p>{viewModel.summary.gpuNote}</p>
          </div>
          <div className="summary-card">
            <span>Interconnect</span>
            <strong>{viewModel.summary.interconnectLabel}</strong>
            <p>{viewModel.summary.interconnectNote}</p>
          </div>
          <div className="summary-card">
            <span>Bottleneck</span>
            <strong>{viewModel.summary.bottleneckLabel}</strong>
            <p>{viewModel.summary.bottleneckNote}</p>
          </div>
        </section>
      </header>

      <ControlsPanel
        config={config}
        onChange={handleConfigChange}
        onReset={handleReset}
        viewModel={viewModel}
      />

      <main className="analysis-stack">
        {isClusterExpanded ? null : clusterView}

        <section className="side-card">
          <div className="side-header">
            <p className="mini-label">Run breakdown</p>
            <h3>{config.cluster.gpuType.name}</h3>
          </div>

          <div className="facts-grid">
            {viewModel.facts.map((fact) => (
              <div key={fact.label} className="fact-row">
                <span>{fact.label}</span>
                <strong>{fact.value}</strong>
              </div>
            ))}
          </div>

          <div className="warning-list" aria-live="polite">
            {viewModel.warnings.map((warning) => (
              <div key={warning} className="warning-pill">
                {warning}
              </div>
            ))}
          </div>
        </section>
      </main>

      {isClusterExpanded ? (
        <div
          className="fullscreen-overlay"
          role="dialog"
          aria-modal="true"
          // A modal dialog needs an accessible name for assistive technology.
          aria-label="Expanded GPU fabric map"
          onClick={(event) => {
            // Only a click on the backdrop itself closes the overlay, not
            // clicks that bubble up from the dialog content.
            if (event.target === event.currentTarget) {
              setExpandedView(null)
            }
          }}
        >
          <div className="fullscreen-shell">
            <div className="fullscreen-toolbar">
              <div>
                <p className="mini-label">Expanded view</p>
                <h2>GPU fabric map</h2>
              </div>

              <button
                type="button"
                className="scene-button"
                onClick={() => setExpandedView(null)}
              >
                close full screen
              </button>
            </div>

            <div className="fullscreen-content">
              {clusterView}
            </div>
          </div>
        </div>
      ) : null}
    </div>
  )
}
204
+
205
+ export default App
src/components/ClusterMap.tsx ADDED
@@ -0,0 +1,2086 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ useApplication,
3
+ useExtend,
4
+ useTick,
5
+ } from '@pixi/react'
6
+ import {
7
+ Container,
8
+ Graphics,
9
+ Text,
10
+ Ticker,
11
+ type Graphics as PixiGraphics,
12
+ } from 'pixi.js'
13
+ import {
14
+ useCallback,
15
+ useEffect,
16
+ useMemo,
17
+ useRef,
18
+ useState,
19
+ type PointerEvent as ReactPointerEvent,
20
+ } from 'react'
21
+ import { PixiSurface } from './pixi/PixiSurface'
22
+ import {
23
+ buildTopologySceneModel,
24
+ describeTarget,
25
+ findHoverTarget,
26
+ getFitViewport,
27
+ worldToScreen,
28
+ type HoverTarget,
29
+ type SceneGpu,
30
+ type SceneNode,
31
+ type TargetDetails,
32
+ type TopologySceneModel,
33
+ type ViewportState,
34
+ } from '../lib/topologyScene'
35
+ import { matchesLinkedFocus, type LinkedFocus } from '../lib/linkedFocus'
36
+ import { type WorkbenchViewModel } from '../lib/workbenchPresenter'
37
+ import {
38
+ TOPOLOGY_LOD_POLICY,
39
+ getTopologyLodState,
40
+ mix,
41
+ screenStroke,
42
+ screenWorld,
43
+ type TopologyLodState,
44
+ } from '../lib/topologyLod'
45
+
46
/**
 * Props for the cluster map component.
 * NOTE(review): the component that consumes these props is outside this
 * excerpt; the per-field notes below are based on the declared types only.
 */
type ClusterMapProps = {
  /** View model typed as `WorkbenchViewModel` (from lib/workbenchPresenter). */
  viewModel: WorkbenchViewModel
  /** Debug flag; presumably enables debug tooling — confirm against the component. */
  debugEnabled: boolean
  /** Snapshot flag; presumably used for deterministic captures — confirm against the component. */
  snapshotMode: boolean
  /** Current linked focus, or `null` when there is none. */
  linkedFocus: LinkedFocus | null
}
52
+
53
/**
 * Independent on/off switches for debug features.
 * NOTE(review): what each switch controls is decided by code outside this
 * excerpt; the names suggest overlays for bounds, ids, heat, hit areas and
 * stats — confirm at the usage sites.
 */
type DebugToggles = {
  bounds: boolean
  ids: boolean
  heat: boolean
  hitAreas: boolean
  stats: boolean
}
60
+
61
/**
 * A 2D pointer position.
 * NOTE(review): the coordinate space (screen vs. world) is not established
 * in this excerpt — confirm where values of this type are produced.
 */
type ScenePointer = {
  x: number
  y: number
}
65
+
66
/**
 * Lookup from an object id to a rectangle.
 * `createDebugObjectMap` fills it with pod, node and GPU ids, each mapped to
 * the result of `worldToScreen` applied to that object's `hitBounds`.
 */
type DebugObjectMap = Record<
  string,
  {
    x: number
    y: number
    width: number
    height: number
  }
>
75
+
76
// Zoom limits read from the shared topology LOD policy.
// NOTE(review): MIN_SCALE is not referenced in the visible part of this file —
// confirm it is still used further down.
const MIN_SCALE = TOPOLOGY_LOD_POLICY.minScale
// Upper bound applied to the computed maximum zoom in getViewportConstraints.
const MAX_SCALE = TOPOLOGY_LOD_POLICY.maxScale
78
+
79
/**
 * Restricts `value` to the range [min, max].
 * The lower bound is applied first and the upper bound last, so when
 * `min > max` the result is `max`. NaN input yields NaN.
 */
const clamp = (value: number, min: number, max: number) => {
  const raised = Math.max(value, min)
  return Math.min(raised, max)
}
81
+
82
/**
 * Allowed zoom range and pan range for the viewport, as returned by
 * `getViewportConstraints`.
 */
type ViewportConstraints = {
  /** Smallest allowed scale (the fit-to-surface scale). */
  minScale: number
  /** Largest allowed scale. */
  maxScale: number
  /** Lower bound for the viewport x offset. */
  minX: number
  /** Upper bound for the viewport x offset. */
  maxX: number
  /** Lower bound for the viewport y offset. */
  minY: number
  /** Upper bound for the viewport y offset. */
  maxY: number
}
90
+
91
/**
 * Derives the zoom range and pan limits for a scene shown on a surface of the
 * given size.
 *
 * - `minScale` is the scale returned by `getFitViewport`.
 * - `maxScale` is 180x the fit scale (at least fit + 0.001), clamped between
 *   the fit scale and `MAX_SCALE`.
 * - The requested `scale` is clamped into that range before the pan limits
 *   are computed.
 * - Per axis: when the scaled scene fits on the surface the offset is pinned
 *   to the centered position (min === max); otherwise the offset may range
 *   from `surface - scaledScene` up to 0.
 *
 * @param model  Scene model providing `width` and `height`.
 * @param width  Surface width.
 * @param height Surface height.
 * @param scale  Requested viewport scale.
 * @returns The zoom and pan constraints for that scale.
 */
const getViewportConstraints = (
  model: TopologySceneModel,
  width: number,
  height: number,
  scale: number,
): ViewportConstraints => {
  const { scale: minScale } = getFitViewport(model, width, height)
  const zoomCeiling = Math.max(minScale * 180, minScale + 0.001)
  const maxScale = clamp(zoomCeiling, minScale, MAX_SCALE)
  const effectiveScale = clamp(scale, minScale, maxScale)

  // Pan range along a single axis; both axes follow the same rule independently.
  const axisRange = (sceneExtent: number, surfaceExtent: number): [number, number] => {
    const scaledExtent = sceneExtent * effectiveScale
    if (scaledExtent <= surfaceExtent) {
      const centered = (surfaceExtent - scaledExtent) / 2
      return [centered, centered]
    }
    return [surfaceExtent - scaledExtent, 0]
  }

  const [minX, maxX] = axisRange(model.width, width)
  const [minY, maxY] = axisRange(model.height, height)

  return { minScale, maxScale, minX, maxX, minY, maxY }
}
137
+
138
/**
 * Clamps a proposed viewport so its scale stays inside the allowed zoom range
 * and its offset stays inside the pan limits for that scale.
 *
 * When the surface has no area (`width <= 0` or `height <= 0`) the viewport is
 * returned unchanged, because no meaningful limits can be derived.
 *
 * `getViewportConstraints` already clamps the requested scale into
 * [minScale, maxScale] before computing the pan limits, so a single call
 * yields the limits for the clamped scale; a second call with the clamped
 * scale would return the same values.
 *
 * @param nextViewport Proposed viewport (scale and x/y offset).
 * @param model        Scene model the viewport looks at.
 * @param width        Surface width.
 * @param height       Surface height.
 * @returns A viewport with scale, x and y clamped to the constraints.
 */
const clampViewportToScene = (
  nextViewport: ViewportState,
  model: TopologySceneModel,
  width: number,
  height: number,
): ViewportState => {
  if (width <= 0 || height <= 0) {
    return nextViewport
  }

  const constraints = getViewportConstraints(model, width, height, nextViewport.scale)

  return {
    scale: clamp(nextViewport.scale, constraints.minScale, constraints.maxScale),
    x: clamp(nextViewport.x, constraints.minX, constraints.maxX),
    y: clamp(nextViewport.y, constraints.minY, constraints.maxY),
  }
}
158
+
159
/**
 * Draw callback that renders nothing: it only clears the given graphics
 * object.
 */
const noopDraw = (graphics: PixiGraphics): void => {
  graphics.clear()
}
162
+
163
/**
 * Sinusoidal multiplier oscillating around 1.
 *
 * @param timeMs Time value; divided by 1000 before use.
 * @param offset Phase offset added to the sine argument.
 * @param depth  Amplitude of the oscillation; the result stays within 1 ± depth.
 */
const pulse = (timeMs: number, offset: number, depth: number) => {
  const phase = timeMs / 1000 * 1.8 + offset
  return 1 + Math.sin(phase) * depth
}
165
+
166
/**
 * Strokes four L-shaped corner brackets around `bounds`.
 *
 * The arm length, the outward inset and the stroke width are each passed
 * through `screenStroke` with the current scale. The brackets are drawn in
 * the order top-left, top-right, bottom-left, bottom-right, each as its own
 * stroked path with square caps and miter joins.
 *
 * @param graphics Target graphics object.
 * @param bounds   Rectangle to frame.
 * @param scale    Current viewport scale.
 * @param color    Stroke colour.
 * @param alpha    Stroke alpha.
 * @param lengthPx Arm length passed to `screenStroke`.
 * @param insetPx  Gap between `bounds` and the brackets, passed to `screenStroke`.
 * @param strokePx Stroke width passed to `screenStroke`.
 */
const drawCornerFocus = (
  graphics: PixiGraphics,
  bounds: { x: number; y: number; width: number; height: number },
  scale: number,
  color: number,
  alpha: number,
  lengthPx: number,
  insetPx: number,
  strokePx: number,
) => {
  type Point = readonly [number, number]

  const arm = screenStroke(scale, lengthPx, 0.3, 16)
  const gap = screenStroke(scale, insetPx, 0.12, 8)
  const lineWidth = screenStroke(scale, strokePx, 0.08, 2.4)

  // Frame edges, pushed outwards from the bounds by the gap.
  const x0 = bounds.x - gap
  const y0 = bounds.y - gap
  const x1 = bounds.x + bounds.width + gap
  const y1 = bounds.y + bounds.height + gap

  // Each bracket is start -> corner -> end.
  const brackets: ReadonlyArray<readonly [Point, Point, Point]> = [
    [[x0, y0 + arm], [x0, y0], [x0 + arm, y0]],
    [[x1 - arm, y0], [x1, y0], [x1, y0 + arm]],
    [[x0, y1 - arm], [x0, y1], [x0 + arm, y1]],
    [[x1 - arm, y1], [x1, y1], [x1, y1 - arm]],
  ]

  for (const [from, corner, to] of brackets) {
    graphics
      .moveTo(from[0], from[1])
      .lineTo(corner[0], corner[1])
      .lineTo(to[0], to[1])
      .stroke({ color, alpha, width: lineWidth, cap: 'square', join: 'miter' })
  }
}
205
+
206
/**
 * Builds a lookup from object id to the `worldToScreen` conversion of that
 * object's `hitBounds`, for every pod, node and GPU in the model.
 *
 * Entries are added in the order pods, nodes, GPUs; if two objects share an
 * id, the later one wins.
 *
 * @param model    Scene model providing `pods` and `nodes` (with their `gpus`).
 * @param viewport Viewport passed to `worldToScreen`.
 * @returns Map of id to converted bounds.
 */
function createDebugObjectMap(
  model: TopologySceneModel,
  viewport: ViewportState,
): DebugObjectMap {
  const entries = [
    ...model.pods.map((pod) => [pod.id, worldToScreen(pod.hitBounds, viewport)] as const),
    ...model.nodes.map((node) => [node.id, worldToScreen(node.hitBounds, viewport)] as const),
    ...model.nodes
      .flatMap((node) => node.gpus)
      .map((gpu) => [gpu.id, worldToScreen(gpu.hitBounds, viewport)] as const),
  ]

  return Object.fromEntries(entries)
}
228
+
229
/**
 * Corner-radius helper: forwards to `screenWorld` with default world-space
 * bounds of 0.06 (min) and 12 (max).
 */
const screenRadius = (
  scale: number,
  pixels: number,
  minWorld = 0.06,
  maxWorld = 12,
) => {
  return screenWorld(scale, pixels, minWorld, maxWorld)
}
235
+
236
/** Creates a plain `{ x, y, width, height }` rectangle object. */
const makeRect = (x: number, y: number, width: number, height: number) => {
  return { x, y, width, height }
}
242
+
243
/**
 * Shrinks a rectangle by `insetX` on the left and right and `insetY` on the
 * top and bottom. Width and height never drop below 0.0001, so an oversized
 * inset yields a tiny rectangle instead of a negative size.
 */
const insetRect = (
  rect: { x: number; y: number; width: number; height: number },
  insetX: number,
  insetY: number,
) => {
  const innerWidth = Math.max(rect.width - insetX * 2, 0.0001)
  const innerHeight = Math.max(rect.height - insetY * 2, 0.0001)
  return makeRect(rect.x + insetX, rect.y + insetY, innerWidth, innerHeight)
}
254
+
255
/**
 * Converts the visible surface into a rectangle in world coordinates by
 * undoing the viewport offset and scale, then grows it by `paddingWorld` on
 * every side.
 *
 * @param viewport     Current viewport (offset and scale).
 * @param width        Surface width.
 * @param height       Surface height.
 * @param paddingWorld Extra margin added on each side, in world units.
 */
const getWorldViewportBounds = (
  viewport: ViewportState,
  width: number,
  height: number,
  paddingWorld: number,
) => {
  const { x, y, scale } = viewport
  const left = -x / scale - paddingWorld
  const top = -y / scale - paddingWorld
  const spanX = width / scale + paddingWorld * 2
  const spanY = height / scale + paddingWorld * 2
  return makeRect(left, top, spanX, spanY)
}
267
+
268
/**
 * Axis-aligned rectangle overlap test. Rectangles that merely touch along an
 * edge or corner count as intersecting.
 */
const rectsIntersect = (
  left: { x: number; y: number; width: number; height: number },
  right: { x: number; y: number; width: number; height: number },
) => {
  // Comparisons are kept in their positive form so NaN coordinates yield false.
  const overlapsX = left.x <= right.x + right.width && left.x + left.width >= right.x
  const overlapsY = left.y <= right.y + right.height && left.y + left.height >= right.y
  return overlapsX && overlapsY
}
276
+
277
/**
 * Bounding rectangle of the segment (x1, y1)-(x2, y2), expanded by `pad` on
 * every side.
 */
const lineBounds = (
  x1: number,
  y1: number,
  x2: number,
  y2: number,
  pad: number,
) => {
  const left = Math.min(x1, x2) - pad
  const top = Math.min(y1, y2) - pad
  const spanX = Math.abs(x2 - x1) + pad * 2
  const spanY = Math.abs(y2 - y1) + pad * 2
  return makeRect(left, top, spanX, spanY)
}
290
+
291
/**
 * Draws one GPU module into `graphics`, adding more internal detail as the
 * module gets larger on screen and as deeper LOD tiers gain weight.
 *
 * Drawing order: optional activity glow, shell, then either a tiny-size
 * fallback (early return) or carrier, cold plate, connector pads, overview
 * glyph, board tier, package tier, silicon tier and micro tier.
 *
 * @param graphics Target graphics object; shapes are appended, it is not cleared here.
 * @param gpu      GPU scene object; reads `lodFrame`, `active` and `utilization`.
 * @param scale    Current viewport scale; world sizes are multiplied by it to
 *                 get the projected size used for thresholds.
 * @param linked   Whether the GPU is part of the linked focus; changes the
 *                 outline colour, alpha and width.
 * @param lod      LOD state; reads `weights.overview/board/package/silicon/micro`.
 * @param emphasis Multiplier applied to the alpha of nearly everything drawn.
 */
function drawModule(
  graphics: PixiGraphics,
  gpu: SceneGpu,
  scale: number,
  linked: boolean,
  lod: TopologyLodState,
  emphasis: number,
) {
  const outer = gpu.lodFrame
  // Size of the module frame after applying the viewport scale.
  const projectedOuterWidth = outer.width * scale
  const projectedOuterHeight = outer.height * scale
  // 0 for inactive GPUs; otherwise mix(0.42, 1, utilization).
  // NOTE(review): `mix` is imported from lib/topologyLod; presumably a linear blend — confirm.
  const activeLoad = gpu.active ? mix(0.42, 1, gpu.utilization) : 0
  // Nested rectangles, each inset from the previous one.
  const shell = insetRect(outer, outer.width * 0.04, outer.height * 0.06)
  const carrier = insetRect(shell, shell.width * 0.05, shell.height * 0.08)
  const coldPlate = insetRect(carrier, carrier.width * 0.14, carrier.height * 0.18)
  const packageFrame = insetRect(coldPlate, coldPlate.width * 0.1, coldPlate.height * 0.13)
  const substrate = insetRect(packageFrame, packageFrame.width * 0.06, packageFrame.height * 0.1)
  const interposer = insetRect(substrate, substrate.width * 0.1, substrate.height * 0.14)
  const die = insetRect(interposer, interposer.width * 0.2, interposer.height * 0.2)
  const dieGrid = insetRect(die, die.width * 0.04, die.height * 0.05)
  // Horizontal strip near the bottom of the shell that holds the connector pads.
  const connectorStrip = makeRect(
    shell.x + shell.width * 0.24,
    shell.y + shell.height * 0.82,
    shell.width * 0.52,
    shell.height * 0.08,
  )
  const boardStroke = linked ? 0xffefc0 : 0xcfdbe2
  // Per-tier alphas: each tier's weight is reduced by a fraction of the next
  // deeper tier's weight (floored at 0), so a tier fades as the next one takes over.
  const overview = Math.max(lod.weights.overview - lod.weights.board * 0.18, 0)
  const board = Math.max(lod.weights.board - lod.weights.package * 0.42, 0)
  const packageAlpha = Math.max(lod.weights.package - lod.weights.silicon * 0.52, 0)
  const siliconAlpha = Math.max(lod.weights.silicon - lod.weights.micro * 0.4, 0)
  const microAlpha = lod.weights.micro
  const boardPresence = Math.max(
    lod.weights.board,
    lod.weights.package * 0.84,
    lod.weights.silicon * 0.66,
  )
  const coldPlatePresence = Math.max(board * 0.7, packageAlpha * 0.88, siliconAlpha * 0.9, microAlpha * 0.8)
  const shellAlpha = mix(gpu.active ? 0.84 : 0.42, gpu.active ? 0.96 : 0.56, boardPresence)
  const frameAlpha = emphasis * (linked ? 0.92 : 0.56)
  const boardStrokeWidth = screenStroke(scale, linked ? 1.25 : 0.9, 0.08, 0.95)
  const detailStroke = screenStroke(scale, 0.6, 0.03, 0.5)
  const boardCorner = screenRadius(scale, 8, 0.18, 2.6)
  const innerCorner = screenRadius(scale, 5, 0.16, 2)
  const dieCorner = screenRadius(scale, 4, 0.14, 1.5)
  // Visibility gates. The two early returns below already exit when the
  // projected size is under 15 x 10, so for finite sizes the size conditions
  // on carrier, cold plate, overview glyph and connector strip are always
  // met by the time these flags are read.
  const renderCarrier = projectedOuterWidth >= 10 && projectedOuterHeight >= 8
  const renderColdPlate = projectedOuterWidth >= 14 && projectedOuterHeight >= 10
  const renderOverviewGlyph = overview > 0.02 && projectedOuterWidth >= 10
  const renderConnectorStrip = (overview > 0.02 || board > 0.02) && projectedOuterWidth >= 15
  const renderBoardTier = board > 0.03 && projectedOuterWidth >= 18
  const renderPackageTier = packageAlpha > 0.04 && projectedOuterWidth >= 30
  const renderSiliconTier = siliconAlpha > 0.05 && die.width * scale >= 26
  const renderMicroTier = microAlpha > 0.06 && die.width * scale >= 72
  // Slightly larger than the shell; used for the activity glow.
  const glowFrame = makeRect(
    shell.x - outer.width * 0.035,
    shell.y - outer.height * 0.05,
    shell.width + outer.width * 0.07,
    shell.height + outer.height * 0.1,
  )

  // Activity glow behind the shell; stronger alpha range when the module is
  // narrower than 18 on screen.
  if (activeLoad > 0.001) {
    graphics
      .roundRect(
        glowFrame.x,
        glowFrame.y,
        glowFrame.width,
        glowFrame.height,
        screenRadius(scale, 10, 0.22, 3),
      )
      .fill({
        color: 0x59e7d2,
        alpha:
          emphasis *
          mix(
            projectedOuterWidth < 18 ? 0.08 : 0.04,
            projectedOuterWidth < 18 ? 0.2 : 0.1,
            activeLoad,
          ),
      })
  }

  // Shell: always drawn, regardless of size.
  graphics
    .roundRect(shell.x, shell.y, shell.width, shell.height, boardCorner)
    .fill({ color: gpu.active ? 0x0d1f29 : 0x0b1821, alpha: shellAlpha * emphasis })
    .stroke({ color: boardStroke, alpha: frameAlpha, width: boardStrokeWidth })

  // Smallest size: shell plus, for active GPUs, a centered signal pip. Nothing else.
  if (projectedOuterWidth < 8 || projectedOuterHeight < 6) {
    if (activeLoad > 0.001) {
      const signalWidth = Math.min(
        shell.width * 0.54,
        screenWorld(scale, 5.6, 0.14, shell.width * 0.54),
      )
      const signalHeight = Math.min(
        shell.height * 0.34,
        screenWorld(scale, 2.8, 0.1, shell.height * 0.34),
      )
      const signalX = shell.x + (shell.width - signalWidth) / 2
      const signalY = shell.y + (shell.height - signalHeight) / 2

      graphics
        .roundRect(
          signalX,
          signalY,
          signalWidth,
          signalHeight,
          screenRadius(scale, 2.2, 0.05, 0.34),
        )
        .fill({
          color: 0x76f1df,
          alpha: emphasis * mix(0.68, 1, activeLoad),
        })
    }

    return
  }

  // Small size: shell plus a single core rectangle (drawn for inactive GPUs too).
  if (projectedOuterWidth < 15 || projectedOuterHeight < 10) {
    const core = insetRect(shell, shell.width * 0.3, shell.height * 0.28)
    graphics
      .roundRect(
        core.x,
        core.y,
        core.width,
        core.height,
        screenRadius(scale, 1.8, 0.04, 0.4),
      )
      .fill({
        color: gpu.active ? 0x6ce9d7 : 0x193843,
        alpha: emphasis * (gpu.active ? mix(0.6, 0.95, activeLoad) : 0.36),
      })
    return
  }

  if (renderCarrier) {
    graphics
      .roundRect(carrier.x, carrier.y, carrier.width, carrier.height, innerCorner)
      .fill({
        color: gpu.active ? 0x112833 : 0x10202a,
        alpha: mix(0.56, 0.82, boardPresence) * emphasis,
      })
  }

  if (renderColdPlate) {
    graphics
      .roundRect(
        coldPlate.x,
        coldPlate.y,
        coldPlate.width,
        coldPlate.height,
        screenRadius(scale, 4.5, 0.12, 1.8),
      )
      .fill({
        color: 0x163643,
        // Dimmed by up to 24% as the micro tier fades in (factor never below 0.76).
        alpha:
          mix(0.02, 0.34, coldPlatePresence) *
          emphasis *
          Math.max(1 - microAlpha * 0.24, 0.76),
      })
  }

  // Row of six evenly spaced pads, centered inside the connector strip.
  if (renderConnectorStrip) {
    const connectorAlpha = Math.max(overview * 0.8, board * 0.55) * emphasis * (gpu.active ? 0.84 : 0.36)
    const padCount = 6
    const padWidth = connectorStrip.width * 0.11
    const padGap = connectorStrip.width * 0.05
    const totalWidth = padCount * padWidth + (padCount - 1) * padGap
    const padStart = connectorStrip.x + (connectorStrip.width - totalWidth) / 2

    for (let index = 0; index < padCount; index += 1) {
      const padX = padStart + index * (padWidth + padGap)
      graphics
        .roundRect(
          padX,
          connectorStrip.y,
          padWidth,
          connectorStrip.height,
          screenRadius(scale, 2, 0.04, 0.6),
        )
        .fill({ color: 0xd6ba72, alpha: connectorAlpha })
    }
  }

  // Overview glyph: a window with two side bars and a dark central rectangle.
  if (renderOverviewGlyph) {
    const moduleWindow = insetRect(carrier, carrier.width * 0.24, carrier.height * 0.26)
    const dieWindow = makeRect(
      moduleWindow.x + moduleWindow.width * 0.31,
      moduleWindow.y + moduleWindow.height * 0.26,
      moduleWindow.width * 0.38,
      moduleWindow.height * 0.48,
    )
    graphics
      .roundRect(
        moduleWindow.x,
        moduleWindow.y,
        moduleWindow.width,
        moduleWindow.height,
        screenRadius(scale, 2.8, 0.06, 0.9),
      )
      .fill({
        color: gpu.active ? 0x235560 : 0x1a3d48,
        alpha: overview * emphasis * mix(gpu.active ? 0.5 : 0.42, gpu.active ? 0.82 : 0.42, activeLoad),
      })

    // Left and right side bars.
    for (const x of [
      moduleWindow.x + moduleWindow.width * 0.14,
      moduleWindow.x + moduleWindow.width * 0.76,
    ]) {
      graphics
        .roundRect(
          x,
          moduleWindow.y + moduleWindow.height * 0.28,
          moduleWindow.width * 0.08,
          moduleWindow.height * 0.44,
          screenRadius(scale, 1.3, 0.03, 0.35),
        )
        .fill({
          color: gpu.active ? 0xdaf08e : 0xcddd73,
          alpha: overview * emphasis * mix(gpu.active ? 0.8 : 0.62, 1, activeLoad * 0.7),
        })
    }

    graphics
      .roundRect(
        dieWindow.x,
        dieWindow.y,
        dieWindow.width,
        dieWindow.height,
        screenRadius(scale, 1.7, 0.03, 0.42),
      )
      .fill({
        color: gpu.active ? 0x0b1820 : 0x081219,
        alpha: overview * emphasis * mix(gpu.active ? 0.92 : 0.86, 1, activeLoad * 0.4),
      })
  }

  // Board tier: cold-plate outline, four mount dots and, when active, a live zone.
  if (renderBoardTier) {
    graphics
      .roundRect(
        coldPlate.x,
        coldPlate.y,
        coldPlate.width,
        coldPlate.height,
        screenRadius(scale, 4.5, 0.1, 1.2),
      )
      .stroke({
        color: 0x88b9c6,
        alpha: board * emphasis * 0.34,
        width: detailStroke,
      })

    const mountRadius = screenWorld(scale, 2.6, 0.03, 0.26)
    const mountAlpha = board * emphasis * (gpu.active ? 0.32 : 0.14)
    for (const [x, y] of [
      [carrier.x + carrier.width * 0.16, carrier.y + carrier.height * 0.2],
      [carrier.x + carrier.width * 0.84, carrier.y + carrier.height * 0.2],
      [carrier.x + carrier.width * 0.16, carrier.y + carrier.height * 0.74],
      [carrier.x + carrier.width * 0.84, carrier.y + carrier.height * 0.74],
    ]) {
      graphics.circle(x, y, mountRadius).fill({ color: 0x8ab7b7, alpha: mountAlpha })
    }

    if (activeLoad > 0.001) {
      const liveZone = insetRect(coldPlate, coldPlate.width * 0.3, coldPlate.height * 0.28)
      graphics
        .roundRect(
          liveZone.x,
          liveZone.y,
          liveZone.width,
          liveZone.height,
          screenRadius(scale, 3, 0.06, 0.8),
        )
        .fill({
          color: 0x64e6d4,
          alpha: board * emphasis * mix(0.12, 0.28, activeLoad),
        })
    }
  }

  // Package tier: package frame, substrate, interposer, two rows of four
  // `hbm*` blocks (top and bottom of the interposer) and the die outline.
  if (renderPackageTier) {
    graphics
      .roundRect(packageFrame.x, packageFrame.y, packageFrame.width, packageFrame.height, innerCorner)
      .stroke({ color: 0xb7c7cd, alpha: packageAlpha * emphasis * 0.8, width: detailStroke })

    graphics
      .roundRect(substrate.x, substrate.y, substrate.width, substrate.height, innerCorner)
      .fill({ color: 0x294546, alpha: packageAlpha * emphasis * 0.34 })

    graphics
      .roundRect(interposer.x, interposer.y, interposer.width, interposer.height, innerCorner)
      .fill({ color: 0x2a5960, alpha: packageAlpha * emphasis * 0.3 })
      .stroke({ color: 0x9deedb, alpha: packageAlpha * emphasis * 0.18, width: detailStroke })

    const hbmWidth = interposer.width * 0.18
    const hbmHeight = interposer.height * 0.16
    for (let index = 0; index < 4; index += 1) {
      const hbmX = interposer.x + interposer.width * 0.04 + index * (hbmWidth + interposer.width * 0.03)
      for (const y of [interposer.y + interposer.height * 0.09, interposer.y + interposer.height * 0.75]) {
        graphics
          .roundRect(
            hbmX,
            y,
            hbmWidth,
            hbmHeight,
            screenRadius(scale, 2, 0.04, 0.45),
          )
          .fill({ color: 0xcfd86f, alpha: packageAlpha * emphasis * 0.7 })
      }
    }

    graphics
      .roundRect(die.x, die.y, die.width, die.height, dieCorner)
      .fill({ color: 0x09161d, alpha: packageAlpha * emphasis * 0.76 })
      .stroke({ color: 0x8bdacd, alpha: packageAlpha * emphasis * 0.24, width: detailStroke })
  }

  // Silicon tier: die fill, a 7 x 5 tile grid and two larger side blocks.
  if (renderSiliconTier) {
    graphics
      .roundRect(die.x, die.y, die.width, die.height, dieCorner)
      .fill({ color: 0x0c1c22, alpha: siliconAlpha * emphasis * 0.58 })

    const tileColumns = 7
    const tileRows = 5
    const tileWidth = dieGrid.width / tileColumns
    const tileHeight = dieGrid.height / tileRows
    for (let row = 0; row < tileRows; row += 1) {
      for (let column = 0; column < tileColumns; column += 1) {
        const tileX = dieGrid.x + column * tileWidth
        const tileY = dieGrid.y + row * tileHeight
        // First column, first/last row and interior tiles each get their own colour.
        const tileFill =
          column === 0
            ? 0xa2d8ec
            : row === 0 || row === tileRows - 1
              ? 0x7fb7ca
              : 0xb8ece2
        graphics
          .roundRect(
            tileX + tileWidth * 0.08,
            tileY + tileHeight * 0.12,
            tileWidth * 0.8,
            tileHeight * 0.72,
            screenRadius(scale, 1.2, 0.03, 0.26),
          )
          .fill({ color: tileFill, alpha: siliconAlpha * emphasis * (column === 0 ? 0.22 : 0.14) })
      }
    }

    for (const block of [
      makeRect(die.x + die.width * 0.06, die.y + die.height * 0.18, die.width * 0.14, die.height * 0.64),
      makeRect(die.x + die.width * 0.78, die.y + die.height * 0.26, die.width * 0.1, die.height * 0.48),
    ]) {
      graphics
        .roundRect(
          block.x,
          block.y,
          block.width,
          block.height,
          screenRadius(scale, 1.2, 0.03, 0.3),
        )
        .fill({ color: 0xaee6ff, alpha: siliconAlpha * emphasis * 0.14 })
    }
  }

  // Micro tier: 38 x 24 grid of cells (912 rounded rects per module per draw).
  if (renderMicroTier) {
    const cellColumns = 38
    const cellRows = 24
    const cellWidth = dieGrid.width / cellColumns
    const cellHeight = dieGrid.height / cellRows
    const cellAlpha = microAlpha * emphasis * 0.22
    for (let row = 0; row < cellRows; row += 1) {
      for (let column = 0; column < cellColumns; column += 1) {
        const x = dieGrid.x + column * cellWidth
        const y = dieGrid.y + row * cellHeight
        // Border band: outer 4 columns and outer 2 rows on each side.
        const edgeZone = column < 4 || column > cellColumns - 5 || row < 2 || row > cellRows - 3
        // Seam lines every 6th column and every 5th row.
        const seam = column % 6 === 0 || row % 5 === 0
        // Colour priority: edge band, then seam, then a fixed pattern based on (row + column).
        const primaryColor = edgeZone
          ? 0x79afbd
          : seam
            ? 0x91d2dc
            : (row + column) % 5 === 0
              ? 0xc7fff0
              : (row + column) % 3 === 0
                ? 0x94d9ef
                : 0xafe9dc
        graphics
          .roundRect(
            x + cellWidth * 0.12,
            y + cellHeight * 0.16,
            cellWidth * 0.72,
            cellHeight * 0.56,
            screenRadius(scale, 0.18, 0.002, 0.05),
          )
          // Seam cells are dimmed, including seam cells inside the edge band.
          .fill({ color: primaryColor, alpha: cellAlpha * (seam ? 0.58 : 1) })
      }
    }
  }
}
687
+
688
/**
 * Draws the background shell of a node: a filled, outlined rounded rectangle,
 * an optional inner fill, and an optional heat overlay.
 *
 * @param graphics    Target graphics object; shapes are appended.
 * @param node        Node scene object; reads its rectangle and `interNodeLoad`.
 * @param scale       Current viewport scale, used for radii and stroke widths.
 * @param linked      Whether the node is part of the linked focus; a linked
 *                    node gets a distinct outline colour, a fixed outline
 *                    alpha of 0.82 and a wider stroke.
 * @param heatEnabled When true, adds an overlay whose alpha scales with
 *                    `node.interNodeLoad`.
 * @param lod         LOD state; only `weights.board` is read.
 * @param emphasis    Multiplier for the inner fill, the unlinked outline and
 *                    the heat overlay. The base fill alpha (0.86) is not scaled.
 */
function drawNodeShell(
  graphics: PixiGraphics,
  node: SceneNode,
  scale: number,
  linked: boolean,
  heatEnabled: boolean,
  lod: TopologyLodState,
  emphasis: number,
) {
  const boardWeight = lod.weights.board
  const innerFillAlpha = mix(0.04, 0.14, boardWeight) * emphasis
  const idleOutlineAlpha = mix(0.08, 0.22, boardWeight) * emphasis
  const cornerRadius = screenRadius(scale, 18, 0.8, 10)
  const { x, y, width, height } = node

  const outline = linked
    ? { color: 0xffdc8a, alpha: 0.82, px: 1.2 }
    : { color: 0x6fd9cd, alpha: idleOutlineAlpha, px: 0.7 }

  // Base shell with outline.
  graphics
    .roundRect(x, y, width, height, cornerRadius)
    .fill({ color: 0x09131b, alpha: 0.86 })
    .stroke({
      color: outline.color,
      alpha: outline.alpha,
      width: screenStroke(scale, outline.px, 0.08, 0.85),
    })

  // Inner fill, inset by 2.5 on each side; skipped while barely visible.
  if (innerFillAlpha > 0.02) {
    graphics
      .roundRect(x + 2.5, y + 2.5, width - 5, height - 5, screenRadius(scale, 14, 0.6, 8))
      .fill({ color: 0x0b1720, alpha: innerFillAlpha })
  }

  if (!heatEnabled) {
    return
  }

  // Heat overlay, inset by 6 on each side, with a fixed corner radius of 8.
  graphics
    .roundRect(x + 6, y + 6, width - 12, height - 12, 8)
    .fill({
      color: 0xe58a43,
      alpha: node.interNodeLoad * 0.08 * emphasis,
    })
}
731
+
732
/**
 * Draws the pod-level layer: one fabric line per pod row and per pod column,
 * followed by a filled, outlined rectangle (plus an optional inner outline)
 * for every visible pod.
 *
 * Row and column lines are computed from all pods in `model.pods` (treated as
 * a row-major grid of `podColumns` columns), connecting the first and last
 * pod centre of each lane; lanes with fewer than two pods are skipped. Pod
 * bodies are drawn only for `visiblePods`.
 *
 * @param graphics    Target graphics object; shapes are appended.
 * @param model       Scene model; reads `pods`, `podRows` and `podColumns`.
 * @param scale       Current viewport scale, used for radii and stroke widths.
 * @param lod         LOD state; reads `weights.overview` and `weights.board`.
 * @param visiblePods Pods whose bodies should be drawn.
 * @param podEmphasis Returns the alpha multiplier for a pod id.
 */
function drawCampusPods(
  graphics: PixiGraphics,
  model: TopologySceneModel,
  scale: number,
  lod: TopologyLodState,
  visiblePods: typeof model.pods,
  podEmphasis: (podId: string) => number,
) {
  type Pod = (typeof model.pods)[number]

  const fabricAlpha = mix(0.02, 0.08, lod.weights.overview)
  const { pods, podRows, podColumns } = model

  // Strokes a single line from the first to the last pod centre of a lane,
  // using the weaker emphasis of the two end pods.
  const strokeSpine = (lane: Pod[], widthPx: number, maxWorld: number) => {
    if (lane.length < 2) {
      return
    }

    const head = lane[0]
    const tail = lane[lane.length - 1]
    graphics
      .moveTo(head.centerX, head.centerY)
      .lineTo(tail.centerX, tail.centerY)
      .stroke({
        color: 0xf1b067,
        alpha: fabricAlpha * Math.min(podEmphasis(head.id), podEmphasis(tail.id)),
        width: screenStroke(scale, widthPx, 0.12, maxWorld),
      })
  }

  for (let row = 0; row < podRows; row += 1) {
    const offset = row * podColumns
    strokeSpine(pods.slice(offset, offset + podColumns), 2.4, 2.2)
  }

  for (let column = 0; column < podColumns; column += 1) {
    strokeSpine(
      pods.filter((_, index) => index % podColumns === column),
      2.1,
      2,
    )
  }

  const innerOutlineAlpha = mix(0.02, 0.08, lod.weights.board)
  const showInnerOutline = innerOutlineAlpha > 0.02

  // Appearance values that differ between active and inactive pods.
  const activeLook = { fillFrom: 0.76, fillTo: 0.88, stroke: 0xe6dbb1, strokeAlpha: 0.34, strokePx: 1.3 }
  const idleLook = { fillFrom: 0.66, fillTo: 0.8, stroke: 0x5ecfca, strokeAlpha: 0.14, strokePx: 0.8 }

  for (const pod of visiblePods) {
    const emphasis = podEmphasis(pod.id)
    const look = pod.active ? activeLook : idleLook

    graphics
      .roundRect(pod.x, pod.y, pod.width, pod.height, screenRadius(scale, 22, 1.2, 18))
      .fill({
        color: 0x08131c,
        alpha: mix(look.fillFrom, look.fillTo, lod.weights.board) * emphasis,
      })
      .stroke({
        color: look.stroke,
        alpha: look.strokeAlpha * emphasis,
        width: screenStroke(scale, look.strokePx, 0.08, 1),
      })

    if (!showInnerOutline) {
      continue
    }

    // Inner outline, inset by 8 on each side.
    graphics
      .roundRect(
        pod.x + 8,
        pod.y + 8,
        pod.width - 16,
        pod.height - 16,
        screenRadius(scale, 18, 0.8, 14),
      )
      .stroke({
        color: 0x6fd9cd,
        alpha: innerOutlineAlpha * emphasis,
        width: screenStroke(scale, 0.45, 0.04, 0.5),
      })
  }
}
809
+
810
+ function TopologyScene({
811
+ model,
812
+ viewport,
813
+ surfaceSize,
814
+ hoveredTarget,
815
+ pinnedTarget,
816
+ linkedFocus,
817
+ linkedGpuIds,
818
+ linkedNodeIds,
819
+ linkedPodIds,
820
+ debugEnabled,
821
+ snapshotMode,
822
+ debugToggles,
823
+ onFpsChange,
824
+ }: {
825
+ model: TopologySceneModel
826
+ viewport: ViewportState
827
+ surfaceSize: { width: number; height: number }
828
+ hoveredTarget: HoverTarget | null
829
+ pinnedTarget: HoverTarget | null
830
+ linkedFocus: LinkedFocus | null
831
+ linkedGpuIds: Set<string>
832
+ linkedNodeIds: Set<string>
833
+ linkedPodIds: Set<string>
834
+ debugEnabled: boolean
835
+ snapshotMode: boolean
836
+ debugToggles: DebugToggles
837
+ onFpsChange: (value: number) => void
838
+ }) {
839
+ useExtend({ Container, Graphics, Text })
840
+ const { app } = useApplication()
841
+ const dynamicRef = useRef<PixiGraphics | null>(null)
842
+ const hoverRef = useRef<HoverTarget | null>(hoveredTarget)
843
+ const pinnedRef = useRef<HoverTarget | null>(pinnedTarget)
844
+ const statsRef = useRef({ elapsed: 0, frames: 0 })
845
+ const allGpus = useMemo(
846
+ () => model.nodes.flatMap((node) => node.gpus),
847
+ [model.nodes],
848
+ )
849
+ const gpuById = useMemo(() => new Map(allGpus.map((gpu) => [gpu.id, gpu])), [allGpus])
850
+ const nodeById = useMemo(() => new Map(model.nodes.map((node) => [node.id, node])), [model.nodes])
851
+ const podById = useMemo(() => new Map(model.pods.map((pod) => [pod.id, pod])), [model.pods])
852
+ const lodState = useMemo(() => getTopologyLodState(viewport.scale), [viewport.scale])
853
+ const worldViewportBounds = useMemo(
854
+ () =>
855
+ getWorldViewportBounds(
856
+ viewport,
857
+ surfaceSize.width,
858
+ surfaceSize.height,
859
+ screenWorld(viewport.scale, 180, 12, 240),
860
+ ),
861
+ [surfaceSize.height, surfaceSize.width, viewport],
862
+ )
863
+ const visiblePods = useMemo(
864
+ () => model.pods.filter((pod) => rectsIntersect(pod.hitBounds, worldViewportBounds)),
865
+ [model.pods, worldViewportBounds],
866
+ )
867
+ const visibleNodes = useMemo(
868
+ () => model.nodes.filter((node) => rectsIntersect(node.hitBounds, worldViewportBounds)),
869
+ [model.nodes, worldViewportBounds],
870
+ )
871
+ const visibleGpus = useMemo(
872
+ () => visibleNodes.flatMap((node) => node.gpus),
873
+ [visibleNodes],
874
+ )
875
+ const visibleLinks = useMemo(
876
+ () => ({
877
+ row: model.rowLinks.filter((link) =>
878
+ rectsIntersect(lineBounds(link.x1, link.y1, link.x2, link.y2, link.hitWidth), worldViewportBounds),
879
+ ),
880
+ column: model.columnLinks.filter((link) =>
881
+ rectsIntersect(lineBounds(link.x1, link.y1, link.x2, link.y2, link.hitWidth), worldViewportBounds),
882
+ ),
883
+ bus: model.busLinks.filter((link) =>
884
+ rectsIntersect(lineBounds(link.x1, link.y1, link.x2, link.y2, link.hitWidth), worldViewportBounds),
885
+ ),
886
+ }),
887
+ [model.busLinks, model.columnLinks, model.rowLinks, worldViewportBounds],
888
+ )
889
+ const visibleLinkCount = useMemo(
890
+ () => visibleLinks.row.length + visibleLinks.column.length + visibleLinks.bus.length,
891
+ [visibleLinks.bus.length, visibleLinks.column.length, visibleLinks.row.length],
892
+ )
893
+
894
+ useEffect(() => {
895
+ hoverRef.current = hoveredTarget
896
+ }, [hoveredTarget])
897
+
898
+ useEffect(() => {
899
+ pinnedRef.current = pinnedTarget
900
+ }, [pinnedTarget])
901
+
902
+ useEffect(() => {
903
+ if (debugEnabled || snapshotMode) {
904
+ window.__PIXI_TOPOLOGY_APP__ = app
905
+ return () => {
906
+ delete window.__PIXI_TOPOLOGY_APP__
907
+ }
908
+ }
909
+
910
+ return undefined
911
+ }, [app, debugEnabled, snapshotMode])
912
+
913
+ const getEmphasis = useCallback(
914
+ (kind: 'pod' | 'node' | 'gpu', id: string) => {
915
+ const focusTarget = pinnedRef.current ?? hoverRef.current
916
+ const base = 1
917
+ if (!focusTarget || lodState.deepIsolation <= 0.001) {
918
+ return base
919
+ }
920
+
921
+ const fadeTo = mix(1, 0.08, lodState.deepIsolation)
922
+ if (kind === 'gpu') {
923
+ if (focusTarget.kind === 'gpu') {
924
+ const gpu = gpuById.get(id)
925
+ const focusedGpu = gpuById.get(focusTarget.id)
926
+ if (!gpu || !focusedGpu) {
927
+ return fadeTo
928
+ }
929
+ if (gpu.id === focusedGpu.id) {
930
+ return 1
931
+ }
932
+ if (gpu.nodeId === focusedGpu.nodeId) {
933
+ return mix(1, 0.34, lodState.deepIsolation)
934
+ }
935
+ if (gpu.domainIndex === focusedGpu.domainIndex) {
936
+ return mix(1, 0.16, lodState.deepIsolation)
937
+ }
938
+ return fadeTo
939
+ }
940
+
941
+ if (focusTarget.kind === 'node') {
942
+ const gpu = gpuById.get(id)
943
+ const focusedNode = nodeById.get(focusTarget.id)
944
+ if (!gpu || !focusedNode) {
945
+ return fadeTo
946
+ }
947
+ if (gpu.nodeId === focusedNode.id) {
948
+ return mix(1, 0.9, lodState.deepIsolation * 0.2)
949
+ }
950
+ if (gpu.domainIndex === focusedNode.domainIndex) {
951
+ return mix(1, 0.18, lodState.deepIsolation)
952
+ }
953
+ return fadeTo
954
+ }
955
+
956
+ const gpu = gpuById.get(id)
957
+ const focusedPod = podById.get(focusTarget.id)
958
+ if (!gpu || !focusedPod) {
959
+ return fadeTo
960
+ }
961
+ return gpu.domainIndex === focusedPod.index ? mix(1, 0.72, lodState.deepIsolation * 0.3) : fadeTo
962
+ }
963
+
964
+ if (kind === 'node') {
965
+ const node = nodeById.get(id)
966
+ if (!node) {
967
+ return fadeTo
968
+ }
969
+ if (focusTarget.kind === 'gpu') {
970
+ const gpu = gpuById.get(focusTarget.id)
971
+ if (!gpu) {
972
+ return fadeTo
973
+ }
974
+ if (node.id === gpu.nodeId) {
975
+ return mix(1, 0.5, lodState.deepIsolation)
976
+ }
977
+ if (node.domainIndex === gpu.domainIndex) {
978
+ return mix(1, 0.18, lodState.deepIsolation)
979
+ }
980
+ return fadeTo
981
+ }
982
+
983
+ if (focusTarget.kind === 'node') {
984
+ const focusedNode = nodeById.get(focusTarget.id)
985
+ if (!focusedNode) {
986
+ return fadeTo
987
+ }
988
+ if (node.id === focusedNode.id) {
989
+ return 1
990
+ }
991
+ if (node.domainIndex === focusedNode.domainIndex) {
992
+ return mix(1, 0.2, lodState.deepIsolation)
993
+ }
994
+ return fadeTo
995
+ }
996
+
997
+ const focusedPod = podById.get(focusTarget.id)
998
+ if (!focusedPod) {
999
+ return fadeTo
1000
+ }
1001
+ return node.domainIndex === focusedPod.index ? mix(1, 0.3, lodState.deepIsolation) : fadeTo
1002
+ }
1003
+
1004
+ const pod = podById.get(id)
1005
+ if (!pod) {
1006
+ return fadeTo
1007
+ }
1008
+ if (focusTarget.kind === 'gpu') {
1009
+ const gpu = gpuById.get(focusTarget.id)
1010
+ return gpu && gpu.domainIndex === pod.index ? mix(1, 0.25, lodState.deepIsolation) : fadeTo
1011
+ }
1012
+ if (focusTarget.kind === 'node') {
1013
+ const node = nodeById.get(focusTarget.id)
1014
+ return node && node.domainIndex === pod.index ? mix(1, 0.32, lodState.deepIsolation) : fadeTo
1015
+ }
1016
+ return focusTarget.id === id ? 1 : fadeTo
1017
+ },
1018
+ [gpuById, lodState.deepIsolation, nodeById, podById],
1019
+ )
1020
+
1021
+ const drawStatic = useCallback(
1022
+ (graphics: PixiGraphics) => {
1023
+ graphics.clear()
1024
+
1025
+ drawCampusPods(graphics, model, viewport.scale, lodState, visiblePods, (podId) =>
1026
+ getEmphasis('pod', podId),
1027
+ )
1028
+
1029
+ const localStructurePresence = Math.max(
1030
+ lodState.weights.board,
1031
+ lodState.weights.package * 0.9,
1032
+ lodState.weights.silicon * 0.7,
1033
+ lodState.weights.micro * 0.45,
1034
+ )
1035
+ const connectorAlpha = 0.18 * localStructurePresence
1036
+ const linkPresence = mix(lodState.weights.overview * 0.35, 1, lodState.weights.board)
1037
+ const hubRadius = screenWorld(viewport.scale, 6, 0.1, 2.4)
1038
+
1039
+ const drawStaticLink = (link: (typeof model.rowLinks)[number]) => {
1040
+ const isRackScope = link.scope === 'rack'
1041
+ if (!isRackScope && localStructurePresence < 0.08) {
1042
+ return
1043
+ }
1044
+
1045
+ const rackFrom =
1046
+ isRackScope
1047
+ ? model.pods.find((pod) => pod.centerX === link.x1 && pod.centerY === link.y1)
1048
+ : null
1049
+ const rackTo =
1050
+ isRackScope
1051
+ ? model.pods.find((pod) => pod.centerX === link.x2 && pod.centerY === link.y2)
1052
+ : null
1053
+ const emphasis =
1054
+ isRackScope
1055
+ ? Math.min(
1056
+ rackFrom ? getEmphasis('pod', rackFrom.id) : 1,
1057
+ rackTo ? getEmphasis('pod', rackTo.id) : 1,
1058
+ )
1059
+ : 1
1060
+ graphics
1061
+ .moveTo(link.x1, link.y1)
1062
+ .lineTo(link.x2, link.y2)
1063
+ .stroke({
1064
+ color: link.color,
1065
+ alpha:
1066
+ (isRackScope
1067
+ ? 0.08 + link.load * 0.24
1068
+ : (0.04 + link.load * 0.12) * localStructurePresence) *
1069
+ linkPresence *
1070
+ emphasis,
1071
+ width: screenStroke(
1072
+ viewport.scale,
1073
+ isRackScope ? 1.6 + link.load * 2 : 0.75 + link.load * 0.9,
1074
+ 0.05,
1075
+ 2.2,
1076
+ ),
1077
+ })
1078
+ }
1079
+
1080
+ visibleLinks.row.forEach(drawStaticLink)
1081
+ visibleLinks.column.forEach(drawStaticLink)
1082
+ visibleLinks.bus.forEach((link) => {
1083
+ if (localStructurePresence < 0.12) {
1084
+ return
1085
+ }
1086
+
1087
+ graphics
1088
+ .moveTo(link.x1, link.y1)
1089
+ .lineTo(link.x2, link.y2)
1090
+ .stroke({
1091
+ color: link.color,
1092
+ alpha: (0.05 + link.load * 0.16) * linkPresence * localStructurePresence,
1093
+ width: screenStroke(viewport.scale, 0.55 + link.load * 0.55, 0.05, 1.1),
1094
+ })
1095
+ })
1096
+
1097
+ for (const node of visibleNodes) {
1098
+ const nodeEmphasis = getEmphasis('node', node.id)
1099
+ if (localStructurePresence >= 0.08) {
1100
+ drawNodeShell(
1101
+ graphics,
1102
+ node,
1103
+ viewport.scale,
1104
+ linkedNodeIds.has(node.id),
1105
+ debugToggles.heat,
1106
+ lodState,
1107
+ nodeEmphasis,
1108
+ )
1109
+
1110
+ graphics.circle(node.hubX, node.hubY, hubRadius).fill({
1111
+ color: linkedNodeIds.has(node.id) ? 0xffcf7a : 0x89f8ea,
1112
+ alpha:
1113
+ ((linkedNodeIds.has(node.id) ? 0.68 : 0.08 + node.interNodeLoad * 0.22) *
1114
+ nodeEmphasis *
1115
+ localStructurePresence),
1116
+ })
1117
+ }
1118
+
1119
+ for (const gpu of node.gpus) {
1120
+ const gpuEmphasis = getEmphasis('gpu', gpu.id)
1121
+ const gpuCenterX = gpu.x + gpu.width / 2
1122
+ const connectorStartY =
1123
+ gpu.y + gpu.height / 2 <= node.hubY ? gpu.y + gpu.height : gpu.y
1124
+ const connectorEndY =
1125
+ gpu.y + gpu.height / 2 <= node.hubY ? node.hubY - 4 : node.hubY + 4
1126
+
1127
+ if (localStructurePresence >= 0.08) {
1128
+ graphics
1129
+ .moveTo(gpuCenterX, connectorStartY)
1130
+ .lineTo(gpuCenterX, connectorEndY)
1131
+ .stroke({
1132
+ color: linkedGpuIds.has(gpu.id) ? 0xffd28a : 0x88efe0,
1133
+ alpha:
1134
+ (linkedGpuIds.has(gpu.id)
1135
+ ? 0.72
1136
+ : connectorAlpha * (gpu.active ? 0.38 + gpu.linkLoad * 0.34 : 0.12)) * gpuEmphasis,
1137
+ width: screenStroke(
1138
+ viewport.scale,
1139
+ linkedGpuIds.has(gpu.id) ? 1 : gpu.active ? 0.55 + gpu.linkLoad * 0.4 : 0.28,
1140
+ 0.03,
1141
+ 0.8,
1142
+ ),
1143
+ })
1144
+ }
1145
+
1146
+ drawModule(graphics, gpu, viewport.scale, linkedGpuIds.has(gpu.id), lodState, gpuEmphasis)
1147
+ }
1148
+ }
1149
+
1150
+ if (debugToggles.bounds) {
1151
+ for (const pod of visiblePods) {
1152
+ graphics.roundRect(
1153
+ pod.hitBounds.x,
1154
+ pod.hitBounds.y,
1155
+ pod.hitBounds.width,
1156
+ pod.hitBounds.height,
1157
+ screenRadius(viewport.scale, 24, 1.2, 16),
1158
+ ).stroke({
1159
+ color: 0xfde6ab,
1160
+ alpha: 0.18,
1161
+ width: screenStroke(viewport.scale, 1, 0.06, 1),
1162
+ })
1163
+ }
1164
+
1165
+ for (const node of visibleNodes) {
1166
+ graphics.roundRect(
1167
+ node.hitBounds.x,
1168
+ node.hitBounds.y,
1169
+ node.hitBounds.width,
1170
+ node.hitBounds.height,
1171
+ screenRadius(viewport.scale, 12, 0.6, 8),
1172
+ ).stroke({
1173
+ color: 0xfdf4cc,
1174
+ alpha: 0.34,
1175
+ width: screenStroke(viewport.scale, 1, 0.06, 1),
1176
+ })
1177
+
1178
+ for (const gpu of node.gpus) {
1179
+ graphics.roundRect(
1180
+ gpu.hitBounds.x,
1181
+ gpu.hitBounds.y,
1182
+ gpu.hitBounds.width,
1183
+ gpu.hitBounds.height,
1184
+ screenRadius(viewport.scale, 6, 0.4, 4),
1185
+ ).stroke({
1186
+ color: 0x7adfff,
1187
+ alpha: 0.24,
1188
+ width: screenStroke(viewport.scale, 1, 0.06, 1),
1189
+ })
1190
+ }
1191
+ }
1192
+ }
1193
+
1194
+ if (debugToggles.hitAreas) {
1195
+ for (const link of [...visibleLinks.row, ...visibleLinks.column, ...visibleLinks.bus]) {
1196
+ graphics
1197
+ .moveTo(link.x1, link.y1)
1198
+ .lineTo(link.x2, link.y2)
1199
+ .stroke({
1200
+ color: link.kind === 'column' ? 0x60aaf7 : 0xffd08a,
1201
+ alpha: 0.15,
1202
+ width: screenStroke(viewport.scale, link.hitWidth, 0.5, 16),
1203
+ })
1204
+ }
1205
+ }
1206
+ },
1207
+ [
1208
+ debugToggles.bounds,
1209
+ debugToggles.heat,
1210
+ debugToggles.hitAreas,
1211
+ getEmphasis,
1212
+ linkedGpuIds,
1213
+ linkedNodeIds,
1214
+ lodState,
1215
+ model,
1216
+ viewport.scale,
1217
+ visibleLinks.bus,
1218
+ visibleLinks.column,
1219
+ visibleLinks.row,
1220
+ visibleNodes,
1221
+ visiblePods,
1222
+ ],
1223
+ )
1224
+
1225
+ const redrawDynamic = useCallback(
1226
+ (timeMs: number) => {
1227
+ const graphics = dynamicRef.current
1228
+ if (!graphics) {
1229
+ return
1230
+ }
1231
+
1232
+ graphics.clear()
1233
+ const pulseTime = snapshotMode ? 0.42 : timeMs / 1000
1234
+ const visibleTarget = pinnedRef.current ?? hoverRef.current
1235
+ const linkGlowAlpha = mix(0.08, 0.18, lodState.weights.board)
1236
+ const animateLinkGlow =
1237
+ lodState.weights.board > 0.14 &&
1238
+ visibleLinkCount < 900 &&
1239
+ viewport.scale >= 0.28
1240
+
1241
+ const drawGlowLink = (link: (typeof model.rowLinks)[number], index: number, color: number) => {
1242
+ const glow = pulse(timeMs, index * 0.19, snapshotMode ? 0 : 0.12)
1243
+ graphics
1244
+ .moveTo(link.x1, link.y1)
1245
+ .lineTo(link.x2, link.y2)
1246
+ .stroke({
1247
+ color,
1248
+ alpha: linkGlowAlpha * (0.12 + link.load * 0.5) * glow,
1249
+ width: screenStroke(viewport.scale, 1.2 + link.load * 2.2, 0.08, 3.2),
1250
+ })
1251
+ }
1252
+
1253
+ if (animateLinkGlow) {
1254
+ visibleLinks.row.forEach((link, index) => {
1255
+ drawGlowLink(link, index, link.color)
1256
+ })
1257
+ visibleLinks.column.forEach((link, index) => {
1258
+ drawGlowLink(link, index + visibleLinks.row.length, link.color)
1259
+ })
1260
+ visibleLinks.bus.forEach((link, index) => {
1261
+ drawGlowLink(
1262
+ link,
1263
+ index + visibleLinks.row.length + visibleLinks.column.length,
1264
+ 0x9efef2,
1265
+ )
1266
+ })
1267
+ }
1268
+
1269
+ if (linkedFocus) {
1270
+ const wave = 0.58 + Math.sin(pulseTime * 2.4) * 0.18
1271
+
1272
+ for (const pod of visiblePods) {
1273
+ if (!linkedPodIds.has(pod.id)) {
1274
+ continue
1275
+ }
1276
+
1277
+ drawCornerFocus(graphics, pod.focusFrame, viewport.scale, 0xffd78e, wave, 18, 4, 2)
1278
+ }
1279
+
1280
+ for (const node of visibleNodes) {
1281
+ if (!linkedNodeIds.has(node.id)) {
1282
+ continue
1283
+ }
1284
+
1285
+ drawCornerFocus(graphics, node.focusFrame, viewport.scale, 0xffd78e, wave, 9, 2, 1.1)
1286
+ }
1287
+
1288
+ for (const gpu of visibleGpus) {
1289
+ if (!linkedGpuIds.has(gpu.id)) {
1290
+ continue
1291
+ }
1292
+
1293
+ drawCornerFocus(graphics, gpu.focusFrame, viewport.scale, 0xffefc3, wave + 0.12, 6, 1, 1)
1294
+ }
1295
+ }
1296
+
1297
+ if (!visibleTarget) {
1298
+ return
1299
+ }
1300
+
1301
+ if (visibleTarget.kind === 'pod') {
1302
+ const pod = podById.get(visibleTarget.id)
1303
+ if (!pod) {
1304
+ return
1305
+ }
1306
+
1307
+ drawCornerFocus(graphics, pod.focusFrame, viewport.scale, 0xf9f5bc, 0.86, 22, 6, 2.3)
1308
+ return
1309
+ }
1310
+
1311
+ if (visibleTarget.kind === 'node') {
1312
+ const node = nodeById.get(visibleTarget.id)
1313
+ if (!node) {
1314
+ return
1315
+ }
1316
+
1317
+ drawCornerFocus(graphics, node.focusFrame, viewport.scale, 0xf9f5bc, 0.9, 10, 2, 1.7)
1318
+ return
1319
+ }
1320
+
1321
+ if (visibleTarget.kind === 'gpu') {
1322
+ const gpu = gpuById.get(visibleTarget.id)
1323
+ if (!gpu) {
1324
+ return
1325
+ }
1326
+
1327
+ drawCornerFocus(graphics, gpu.focusFrame, viewport.scale, 0xffffff, 0.96, 7, 1.5, 1.3)
1328
+ return
1329
+ }
1330
+
1331
+ const link = [...model.rowLinks, ...model.columnLinks, ...model.busLinks].find(
1332
+ (item) => item.id === visibleTarget.id,
1333
+ )
1334
+ if (!link) {
1335
+ return
1336
+ }
1337
+
1338
+ graphics
1339
+ .moveTo(link.x1, link.y1)
1340
+ .lineTo(link.x2, link.y2)
1341
+ .stroke({
1342
+ color: 0xfef4c8,
1343
+ alpha: 0.92,
1344
+ width: screenStroke(viewport.scale, 2.6 + link.load * 2.8, 0.14, 4.2),
1345
+ })
1346
+ },
1347
+ [
1348
+ gpuById,
1349
+ linkedFocus,
1350
+ linkedGpuIds,
1351
+ linkedNodeIds,
1352
+ linkedPodIds,
1353
+ lodState.weights.board,
1354
+ model,
1355
+ nodeById,
1356
+ podById,
1357
+ snapshotMode,
1358
+ visibleGpus,
1359
+ visibleLinkCount,
1360
+ viewport.scale,
1361
+ visibleLinks.bus,
1362
+ visibleLinks.column,
1363
+ visibleLinks.row,
1364
+ visibleNodes,
1365
+ visiblePods,
1366
+ ],
1367
+ )
1368
+
1369
+ useEffect(() => {
1370
+ redrawDynamic(0)
1371
+ }, [redrawDynamic, hoveredTarget, pinnedTarget, linkedFocus])
1372
+
1373
+ useTick(
1374
+ useCallback(
1375
+ (ticker: Ticker) => {
1376
+ if (snapshotMode) {
1377
+ return
1378
+ }
1379
+
1380
+ const shouldAnimate =
1381
+ linkedFocus != null ||
1382
+ (lodState.weights.board > 0.14 &&
1383
+ visibleLinkCount < 900 &&
1384
+ viewport.scale >= 0.28)
1385
+
1386
+ if (!shouldAnimate) {
1387
+ return
1388
+ }
1389
+
1390
+ redrawDynamic(performance.now())
1391
+
1392
+ statsRef.current.elapsed += ticker.deltaMS
1393
+ statsRef.current.frames += 1
1394
+ if (statsRef.current.elapsed >= 500) {
1395
+ const fps = (statsRef.current.frames * 1000) / statsRef.current.elapsed
1396
+ onFpsChange(fps)
1397
+ statsRef.current.elapsed = 0
1398
+ statsRef.current.frames = 0
1399
+ }
1400
+ },
1401
+ [
1402
+ linkedFocus,
1403
+ lodState.weights.board,
1404
+ onFpsChange,
1405
+ redrawDynamic,
1406
+ snapshotMode,
1407
+ viewport.scale,
1408
+ visibleLinkCount,
1409
+ ],
1410
+ ),
1411
+ )
1412
+
1413
+ const debugLabels = debugEnabled && debugToggles.ids
1414
+
1415
+ return (
1416
+ <pixiContainer x={viewport.x} y={viewport.y} scale={viewport.scale}>
1417
+ <pixiGraphics draw={drawStatic} />
1418
+ <pixiGraphics ref={dynamicRef} draw={noopDraw} />
1419
+
1420
+ {debugLabels
1421
+ ? visiblePods.map((pod) => (
1422
+ <pixiText
1423
+ key={`pod-label-${pod.id}`}
1424
+ x={pod.x + 30}
1425
+ y={pod.y + 24}
1426
+ text={pod.active ? 'ACTIVE RACK' : `R${pod.index + 1}`}
1427
+ style={{
1428
+ fill: 0xdff7f0,
1429
+ fontSize: screenWorld(viewport.scale, 18, 3.5, 24) * lodState.textScale,
1430
+ fontFamily: 'IBM Plex Mono',
1431
+ letterSpacing: screenWorld(viewport.scale, 2, 0.2, 2),
1432
+ }}
1433
+ />
1434
+ ))
1435
+ : null}
1436
+
1437
+ {debugLabels
1438
+ ? visibleNodes.map((node) => (
1439
+ <pixiText
1440
+ key={`node-label-${node.id}`}
1441
+ x={node.x + 10}
1442
+ y={node.y + 8}
1443
+ text={`N${node.index + 1}`}
1444
+ style={{
1445
+ fill: 0xdff7f0,
1446
+ fontSize: screenWorld(viewport.scale, 8, 2, 10) * lodState.textScale,
1447
+ fontFamily: 'IBM Plex Mono',
1448
+ }}
1449
+ />
1450
+ ))
1451
+ : null}
1452
+ </pixiContainer>
1453
+ )
1454
+ }
1455
+
1456
+ export function ClusterMap({
1457
+ viewModel,
1458
+ debugEnabled,
1459
+ snapshotMode,
1460
+ linkedFocus,
1461
+ }: ClusterMapProps) {
1462
+ const model = useMemo(() => buildTopologySceneModel(viewModel), [viewModel])
1463
+ const [viewport, setViewport] = useState<ViewportState>({ x: 0, y: 0, scale: 1 })
1464
+ const [surfaceSize, setSurfaceSize] = useState({ width: 0, height: 0 })
1465
+ const [sceneReady, setSceneReady] = useState(false)
1466
+ const [hoveredTarget, setHoveredTarget] = useState<HoverTarget | null>(null)
1467
+ const [pinnedTarget, setPinnedTarget] = useState<HoverTarget | null>(null)
1468
+ const [isDragging, setIsDragging] = useState(false)
1469
+ const [fps, setFps] = useState(0)
1470
+ const [debugToggles, setDebugToggles] = useState<DebugToggles>({
1471
+ bounds: false,
1472
+ ids: false,
1473
+ heat: false,
1474
+ hitAreas: false,
1475
+ stats: true,
1476
+ })
1477
+ const surfaceRef = useRef<HTMLDivElement | null>(null)
1478
+ const interactionLayerRef = useRef<HTMLDivElement | null>(null)
1479
+ const interactionRef = useRef({
1480
+ dragging: false,
1481
+ moved: false,
1482
+ distance: 0,
1483
+ lastPointer: null as ScenePointer | null,
1484
+ pointers: new Map<number, ScenePointer>(),
1485
+ pinchDistance: 0,
1486
+ pinchMidpoint: null as ScenePointer | null,
1487
+ })
1488
+
// Ids of every GPU, across all nodes, that matches the externally linked focus.
const linkedGpuIds = useMemo(() => {
  const matchingIds = model.nodes.flatMap((node) =>
    node.gpus
      .filter((gpu) => matchesLinkedFocus(gpu, linkedFocus))
      .map((gpu) => gpu.id),
  )

  return new Set(matchingIds)
}, [linkedFocus, model.nodes])
1497
+
// Ids of the nodes that host at least one GPU matching the linked focus.
const linkedNodeIds = useMemo(() => {
  const hasLinkedGpu = (node: (typeof model.nodes)[number]) =>
    node.gpus.some((gpu) => matchesLinkedFocus(gpu, linkedFocus))

  const matchingNodes = model.nodes.filter((node) => hasLinkedGpu(node))
  return new Set(matchingNodes.map((node) => node.id))
}, [linkedFocus, model.nodes])
1505
+
// Pod ids (`pod-<domainIndex>`) of every node that hosts a GPU matching the
// linked focus. Empty when nothing is linked.
const linkedPodIds = useMemo(() => {
  const podIds = new Set<string>()

  if (linkedFocus) {
    for (const node of model.nodes) {
      if (node.gpus.some((gpu) => matchesLinkedFocus(gpu, linkedFocus))) {
        podIds.add(`pod-${node.domainIndex}`)
      }
    }
  }

  return podIds
}, [linkedFocus, model.nodes])
1517
+
// Whenever the model or the measured surface changes, fit the camera on the
// next animation frame and flag the scene as ready one frame after that.
useEffect(() => {
  const width = surfaceSize.width
  const height = surfaceSize.height
  if (width === 0 || height === 0) {
    return
  }

  // Every frame request made by this effect run, so cleanup can cancel them all.
  const pendingFrames: number[] = []

  const markReady = () => {
    setSceneReady(true)
  }
  const fitToSurface = () => {
    setViewport(getFitViewport(model, width, height))
    pendingFrames.push(requestAnimationFrame(markReady))
  }

  pendingFrames.push(requestAnimationFrame(fitToSurface))

  return () => {
    for (const frameId of pendingFrames) {
      cancelAnimationFrame(frameId)
    }
  }
}, [model, surfaceSize.height, surfaceSize.width])
1536
+
// Inspector content for the active target; a pinned target wins over a hovered one.
const focusedDetails = useMemo<TargetDetails | null>(
  () => describeTarget(model, viewModel, pinnedTarget ?? hoveredTarget),
  [hoveredTarget, model, pinnedTarget, viewModel],
)

// Debug object map derived from the model and the current viewport.
const debugObjects = useMemo(() => {
  return createDebugObjectMap(model, viewport)
}, [model, viewport])

// Primary level-of-detail band for the current zoom.
const detailLevel = useMemo(() => {
  const lodState = getTopologyLodState(viewport.scale)
  return lodState.primaryBand
}, [viewport.scale])

// Viewport limits for the current surface and zoom; null until the surface is measured.
const viewportConstraints = useMemo(() => {
  const hasSurface = surfaceSize.width !== 0 && surfaceSize.height !== 0

  return hasSurface
    ? getViewportConstraints(model, surfaceSize.width, surfaceSize.height, viewport.scale)
    : null
}, [model, surfaceSize.height, surfaceSize.width, viewport.scale])
1553
+
// Publishes a snapshot of the scene state on `window.__TOPOLOGY_DEBUG__` while
// debug or snapshot mode is active, and removes it otherwise and on cleanup.
useEffect(() => {
  const clearDebugHandle = () => {
    delete window.__TOPOLOGY_DEBUG__
  }

  const exposeDebug = debugEnabled || snapshotMode
  if (!exposeDebug) {
    clearDebugHandle()
    return
  }

  // Lets external callers move the camera; the request is clamped to the scene.
  const applyViewport = (nextViewport: ViewportState) => {
    setViewport(clampViewportToScene(nextViewport, model, surfaceSize.width, surfaceSize.height))
  }

  window.__TOPOLOGY_DEBUG__ = {
    ready: sceneReady,
    viewport,
    surfaceSize,
    objectCounts: model.objectCounts,
    objects: debugObjects,
    hoveredTarget,
    pinnedTarget,
    detailLevel,
    setViewport: applyViewport,
  }

  return clearDebugHandle
}, [
  debugEnabled,
  debugObjects,
  hoveredTarget,
  model.objectCounts,
  pinnedTarget,
  sceneReady,
  snapshotMode,
  surfaceSize,
  detailLevel,
  model,
  viewport,
])
1590
+
// Converts client (page) coordinates into coordinates relative to the
// interaction layer; returns null while the layer is not mounted.
const scenePointerFromClient = useCallback((clientX: number, clientY: number) => {
  const layerRect = interactionLayerRef.current?.getBoundingClientRect()

  return layerRect
    ? {
        x: clientX - layerRect.left,
        y: clientY - layerRect.top,
      }
    : null
}, [])
1602
+
// Event-flavoured wrapper around scenePointerFromClient.
const scenePointerFromEvent = useCallback(
  ({ clientX, clientY }: Pick<ReactPointerEvent<HTMLDivElement>, 'clientX' | 'clientY'>) => {
    return scenePointerFromClient(clientX, clientY)
  },
  [scenePointerFromClient],
)
1608
+
// Maps a layer-relative pointer into world space by undoing the viewport
// translation and scale.
const toWorldPoint = useCallback(
  (pointer: ScenePointer) => {
    const { x: offsetX, y: offsetY, scale } = viewport

    return {
      x: (pointer.x - offsetX) / scale,
      y: (pointer.y - offsetY) / scale,
    }
  },
  [viewport],
)
1616
+
// setViewport variant that accepts either a value or an updater and always
// passes the result through clampViewportToScene before storing it.
const setViewportClamped = useCallback(
  (updater: ViewportState | ((current: ViewportState) => ViewportState)) => {
    setViewport((current) => {
      let candidate: ViewportState
      if (typeof updater === 'function') {
        candidate = (updater as (current: ViewportState) => ViewportState)(current)
      } else {
        candidate = updater
      }

      return clampViewportToScene(candidate, model, surfaceSize.width, surfaceSize.height)
    })
  },
  [model, surfaceSize.height, surfaceSize.width],
)
1630
+
// Multiplies the zoom by `zoomFactor` (limited to the allowed scale range)
// while keeping the world point under `screenPoint` fixed on screen.
const applyZoomAtPointer = useCallback(
  (screenPoint: ScenePointer, zoomFactor: number) => {
    const minScale = viewportConstraints?.minScale ?? MIN_SCALE
    const maxScale = viewportConstraints?.maxScale ?? MAX_SCALE

    setViewportClamped((current) => {
      const nextScale = clamp(current.scale * zoomFactor, minScale, maxScale)

      // World position currently under the pointer, before the zoom is applied.
      const worldX = (screenPoint.x - current.x) / current.scale
      const worldY = (screenPoint.y - current.y) / current.scale

      return {
        scale: nextScale,
        x: screenPoint.x - worldX * nextScale,
        y: screenPoint.y - worldY * nextScale,
      }
    })
  },
  [setViewportClamped, viewportConstraints?.maxScale, viewportConstraints?.minScale],
)
1648
+
// Re-runs hit testing for `pointer` and stores the result as the hover target.
// A null pointer clears the hover. The previous state object is kept when the
// hit target is unchanged.
const updateHoverFromPointer = useCallback(
  (pointer: ScenePointer | null) => {
    if (pointer === null) {
      setHoveredTarget(null)
      return
    }

    const { x: worldX, y: worldY } = toWorldPoint(pointer)
    const candidate = findHoverTarget(model, worldX, worldY)

    setHoveredTarget((previous) => {
      const unchanged = previous?.kind === candidate?.kind && previous?.id === candidate?.id
      return unchanged ? previous : candidate
    })
  },
  [model, toWorldPoint],
)
1669
+
// Re-fits the camera to the whole model; does nothing until the surface has a size.
const resetViewport = useCallback(() => {
  const hasSurface = surfaceSize.width !== 0 && surfaceSize.height !== 0

  if (hasSurface) {
    setViewport(getFitViewport(model, surfaceSize.width, surfaceSize.height))
  }
}, [model, surfaceSize.height, surfaceSize.width])
1677
+
// Last size reported by the surface. Kept in a ref so the handler below can
// detect duplicate reports without depending on the `surfaceSize` state.
const lastSurfaceSizeRef = useRef({ width: 0, height: 0 })

/**
 * Records a new surface size and marks the scene as not ready until the
 * fit-viewport effect has re-run for that size.
 *
 * Duplicate reports (same width and height as the previous one) are ignored
 * entirely. Resetting `sceneReady` for them would leave the scene stuck in the
 * not-ready state, because the effect that sets it back to true only re-runs
 * when the stored size actually changes.
 */
const handleSurfaceSizeChange = useCallback((width: number, height: number) => {
  const last = lastSurfaceSizeRef.current
  if (last.width === width && last.height === height) {
    return
  }

  lastSurfaceSizeRef.current = { width, height }
  setSurfaceSize({ width, height })
  setSceneReady(false)
}, [])
1688
+
// Attaches a native, non-passive wheel listener to the interaction layer so the
// handler can call preventDefault() and turn wheel input into zoom at the cursor.
useEffect(() => {
  const layer = interactionLayerRef.current
  if (!layer) {
    return
  }

  const overlaySelector = '.scene-inspector, .scene-debug-panel'

  const onWheel = (event: WheelEvent) => {
    // Wheel input over the inspector or debug panel is left to the browser.
    const { target } = event
    if (target instanceof Element && target.closest(overlaySelector) !== null) {
      return
    }

    const pointer = scenePointerFromClient(event.clientX, event.clientY)
    if (!pointer) {
      return
    }

    event.preventDefault()
    event.stopPropagation()

    // Wheel events with ctrlKey set get a 1.8x boost.
    const delta = event.deltaY * (event.ctrlKey ? 1.8 : 1)
    applyZoomAtPointer(pointer, Math.exp(-delta * 0.0015))
  }

  layer.addEventListener('wheel', onWheel, { passive: false })

  return () => {
    layer.removeEventListener('wheel', onWheel)
  }
}, [applyZoomAtPointer, scenePointerFromClient])
1718
+
// Click behaviour: pin the pod/node/GPU under the pointer, unpin it when it is
// already pinned, and clear the pin when the click hits nothing or a link.
const togglePinnedTarget = useCallback(
  (pointer: ScenePointer) => {
    const { x: worldX, y: worldY } = toWorldPoint(pointer)
    const target = findHoverTarget(model, worldX, worldY)

    if (target && target.kind !== 'link') {
      setPinnedTarget((current) => {
        const alreadyPinned = current?.kind === target.kind && current.id === target.id
        return alreadyPinned ? null : target
      })
      return
    }

    setPinnedTarget(null)
  },
  [model, toWorldPoint],
)
1739
+
// Pointer-down handler: registers the pointer, starts a drag for the first
// pointer or seeds the pinch baseline for the second, then captures the pointer.
const handlePointerDown = useCallback(
  (event: ReactPointerEvent<HTMLDivElement>) => {
    const layer = event.currentTarget
    // Presses that start on a child of the layer (inspector, debug panel) are ignored.
    if (event.target !== layer) {
      return
    }

    const pointer = scenePointerFromEvent(event)
    if (!pointer) {
      return
    }

    const interaction = interactionRef.current
    const { pointers } = interaction
    pointers.set(event.pointerId, pointer)

    // Every press restarts click detection.
    Object.assign(interaction, { lastPointer: pointer, moved: false, distance: 0 })

    switch (pointers.size) {
      case 1:
        interaction.dragging = true
        setIsDragging(true)
        break
      case 2: {
        const [first, second] = Array.from(pointers.values())
        interaction.dragging = false
        interaction.pinchDistance = Math.hypot(second.x - first.x, second.y - first.y)
        interaction.pinchMidpoint = {
          x: (first.x + second.x) / 2,
          y: (first.y + second.y) / 2,
        }
        setIsDragging(false)
        break
      }
      default:
        break
    }

    layer.setPointerCapture(event.pointerId)
  },
  [scenePointerFromEvent],
)
1777
+
/**
 * Pointer-move handler for the interaction layer.
 *
 * Depending on the tracked pointers this either
 *  - applies a two-pointer pinch (zoom about the current midpoint plus the
 *    midpoint's movement since the previous event),
 *  - pans the viewport for a single-pointer drag, or
 *  - refreshes the hover target when no gesture is in progress.
 */
const handlePointerMove = useCallback(
  (event: ReactPointerEvent<HTMLDivElement>) => {
    const pointer = scenePointerFromEvent(event)
    if (!pointer) {
      return
    }

    const interaction = interactionRef.current
    // Only pointers registered on pointer-down are tracked; plain hovers are not.
    if (interaction.pointers.has(event.pointerId)) {
      interaction.pointers.set(event.pointerId, pointer)
    }

    if (interaction.pointers.size === 2) {
      const [first, second] = Array.from(interaction.pointers.values())
      const deltaX = second.x - first.x
      const deltaY = second.y - first.y
      // Clamp to 1px so the next zoom factor never divides by zero.
      const distance = Math.max(Math.hypot(deltaX, deltaY), 1)
      const midpoint = {
        x: (first.x + second.x) / 2,
        y: (first.y + second.y) / 2,
      }

      // Snapshot the previous gesture state before scheduling the update.
      // React may run the updater later (during the next render), by which
      // time the ref already holds the new midpoint or null after a release.
      // Reading the ref inside the updater would drop the pan delta or throw.
      const previousDistance = interaction.pinchDistance
      const previousMidpoint = interaction.pinchMidpoint

      if (previousDistance > 0 && previousMidpoint) {
        const zoomFactor = distance / previousDistance
        const panX = midpoint.x - previousMidpoint.x
        const panY = midpoint.y - previousMidpoint.y

        setViewportClamped((current) => {
          const nextScale = clamp(
            current.scale * zoomFactor,
            viewportConstraints?.minScale ?? MIN_SCALE,
            viewportConstraints?.maxScale ?? MAX_SCALE,
          )
          const worldX = (midpoint.x - current.x) / current.scale
          const worldY = (midpoint.y - current.y) / current.scale

          return {
            scale: nextScale,
            x: midpoint.x - worldX * nextScale + panX,
            y: midpoint.y - worldY * nextScale + panY,
          }
        })
      }

      interaction.pinchDistance = distance
      interaction.pinchMidpoint = midpoint
      // A pinch never counts as a click.
      interaction.moved = true
      return
    }

    if (interaction.dragging && interaction.lastPointer) {
      const deltaMoveX = pointer.x - interaction.lastPointer.x
      const deltaMoveY = pointer.y - interaction.lastPointer.y
      interaction.lastPointer = pointer
      interaction.distance += Math.abs(deltaMoveX) + Math.abs(deltaMoveY)

      // Small jitter is tolerated so a slightly shaky press still registers as a click.
      if (interaction.distance > 2) {
        interaction.moved = true
      }

      setViewportClamped((current) => ({
        ...current,
        x: current.x + deltaMoveX,
        y: current.y + deltaMoveY,
      }))
      return
    }

    // Moves over child overlays must not change the hover target.
    if (event.target !== event.currentTarget) {
      return
    }

    updateHoverFromPointer(pointer)
  },
  [
    scenePointerFromEvent,
    setViewportClamped,
    updateHoverFromPointer,
    viewportConstraints?.maxScale,
    viewportConstraints?.minScale,
  ],
)
1863
+
/**
 * Stops tracking `pointerId` and re-derives the gesture state from the
 * pointers that remain down.
 *
 *  - No pointers left: the gesture ends and dragging is cleared.
 *  - Exactly one left: it takes over as a drag. The `isDragging` state is
 *    synced here as well, because the pinch that just ended had set it to false.
 *  - Two or more left: no drag; the next move event re-seeds the pinch.
 *
 * The pinch baseline is always discarded, because it was measured against a
 * pointer set that no longer exists. Keeping it would make the next pinch
 * move compute its zoom factor from a stale distance.
 */
const releasePointer = useCallback((pointerId: number) => {
  const interaction = interactionRef.current
  interaction.pointers.delete(pointerId)

  interaction.pinchDistance = 0
  interaction.pinchMidpoint = null

  const remaining = interaction.pointers.size
  if (remaining === 0) {
    interaction.dragging = false
    interaction.lastPointer = null
    setIsDragging(false)
    return
  }

  const remainingPointer = Array.from(interaction.pointers.values())[0]
  const isSinglePointerDrag = remaining === 1
  interaction.lastPointer = remainingPointer
  interaction.dragging = isSinglePointerDrag
  setIsDragging(isSinglePointerDrag)
}, [])
1884
+
// Pointer-up handler: releases capture and tracking, refreshes the hover target,
// and treats the release as a click (toggling the pin) when the pointer barely
// moved, no second pointer was down, and the press was on the layer itself.
const handlePointerUp = useCallback(
  (event: ReactPointerEvent<HTMLDivElement>) => {
    const pointer = scenePointerFromEvent(event)
    const layer = event.currentTarget

    // Click detection must read the gesture state before releasePointer changes it.
    const { moved, distance, pointers } = interactionRef.current
    const wasClick = !moved && distance < 8 && pointers.size <= 1

    if (layer.hasPointerCapture(event.pointerId)) {
      layer.releasePointerCapture(event.pointerId)
    }

    releasePointer(event.pointerId)

    if (!pointer) {
      return
    }

    updateHoverFromPointer(pointer)

    if (wasClick && event.target === layer) {
      togglePinnedTarget(pointer)
    }
  },
  [releasePointer, scenePointerFromEvent, togglePinnedTarget, updateHoverFromPointer],
)
1909
+
// Pointer leave/cancel handler: drops every tracked pointer, ends any drag or
// pinch, and clears the hover target.
const handlePointerLeave = useCallback(() => {
  const interaction = interactionRef.current

  interaction.pointers.clear()
  Object.assign(interaction, {
    dragging: false,
    lastPointer: null,
    pinchDistance: 0,
    pinchMidpoint: null,
  })

  setIsDragging(false)
  setHoveredTarget(null)
}, [])
1919
+
// Flips a single debug toggle, leaving the others untouched.
const toggleDebugFlag = (key: keyof DebugToggles) => {
  setDebugToggles((current) => {
    const next = { ...current }
    next[key] = !current[key]
    return next
  })
}
1926
+
1927
+ const linkedSummary = linkedFocus ? linkedFocus.label : null
1928
+
1929
+ return (
1930
+ <div className="topology-scene-shell">
1931
+ <div className="scene-toolbar">
1932
+ <div className="scene-toolbar-actions">
1933
+ <button
1934
+ type="button"
1935
+ className="scene-button"
1936
+ onClick={resetViewport}
1937
+ data-testid="camera-reset"
1938
+ >
1939
+ reset camera
1940
+ </button>
1941
+ </div>
1942
+ </div>
1943
+
1944
+ <div
1945
+ ref={surfaceRef}
1946
+ className="pixi-surface-wrap topology-surface-wrap"
1947
+ >
1948
+ <PixiSurface
1949
+ className="pixi-surface"
1950
+ canvasClassName="pixi-canvas"
1951
+ testId="topology-scene"
1952
+ onSizeChange={handleSurfaceSizeChange}
1953
+ >
1954
+ {() => (
1955
+ <TopologyScene
1956
+ model={model}
1957
+ viewport={viewport}
1958
+ surfaceSize={surfaceSize}
1959
+ hoveredTarget={hoveredTarget}
1960
+ pinnedTarget={pinnedTarget}
1961
+ linkedFocus={linkedFocus}
1962
+ linkedGpuIds={linkedGpuIds}
1963
+ linkedNodeIds={linkedNodeIds}
1964
+ linkedPodIds={linkedPodIds}
1965
+ debugEnabled={debugEnabled}
1966
+ snapshotMode={snapshotMode}
1967
+ debugToggles={debugToggles}
1968
+ onFpsChange={setFps}
1969
+ />
1970
+ )}
1971
+ </PixiSurface>
1972
+
1973
+ <div
1974
+ ref={interactionLayerRef}
1975
+ className={`topology-interaction-layer${isDragging ? ' is-dragging' : ''}`}
1976
+ data-testid="topology-interaction-layer"
1977
+ onPointerDown={handlePointerDown}
1978
+ onPointerMove={handlePointerMove}
1979
+ onPointerUp={handlePointerUp}
1980
+ onPointerCancel={handlePointerLeave}
1981
+ onPointerLeave={handlePointerLeave}
1982
+ onDoubleClick={(event) => {
1983
+ if (event.target !== event.currentTarget) {
1984
+ return
1985
+ }
1986
+
1987
+ resetViewport()
1988
+ }}
1989
+ >
1990
+ <div className="scene-inspector" data-testid="topology-inspector">
1991
+ <p className="mini-label">
1992
+ {pinnedTarget ? 'Pinned target' : hoveredTarget ? 'Hover target' : 'Topology inspector'}
1993
+ </p>
1994
+ {focusedDetails ? (
1995
+ <>
1996
+ <h3>{focusedDetails.heading}</h3>
1997
+ <p className="inspector-subheading">{focusedDetails.subheading}</p>
1998
+ {linkedSummary ? (
1999
+ <p className="inspector-link-note">Transformer highlight: {linkedSummary}</p>
2000
+ ) : null}
2001
+ <dl className="inspector-grid">
2002
+ {focusedDetails.metrics.map((metric) => (
2003
+ <div key={`${focusedDetails.id}-${metric.label}`}>
2004
+ <dt>{metric.label}</dt>
2005
+ <dd>{metric.value}</dd>
2006
+ </div>
2007
+ ))}
2008
+ </dl>
2009
+ </>
2010
+ ) : (
2011
+ <>
2012
+ <h3>Inspect the cluster</h3>
2013
+ <p className="inspector-subheading">
2014
+ Hover a rack or GPU to inspect placement, memory headroom, and link load.
2015
+ Pan and zoom to move between fabric and package detail.
2016
+ </p>
2017
+ {linkedSummary ? (
2018
+ <p className="inspector-link-note">Transformer highlight: {linkedSummary}</p>
2019
+ ) : null}
2020
+ </>
2021
+ )}
2022
+ </div>
2023
+
2024
+ {(debugEnabled || snapshotMode) && (
2025
+ <div className="scene-debug-panel" data-testid="topology-debug">
2026
+ <p className="mini-label">Debug overlay</p>
2027
+ <div className="debug-toggle-grid">
2028
+ <label>
2029
+ <input
2030
+ type="checkbox"
2031
+ checked={debugToggles.bounds}
2032
+ onChange={() => toggleDebugFlag('bounds')}
2033
+ />
2034
+ Bounds
2035
+ </label>
2036
+ <label>
2037
+ <input
2038
+ type="checkbox"
2039
+ checked={debugToggles.ids}
2040
+ onChange={() => toggleDebugFlag('ids')}
2041
+ />
2042
+ Node / GPU ids
2043
+ </label>
2044
+ <label>
2045
+ <input
2046
+ type="checkbox"
2047
+ checked={debugToggles.heat}
2048
+ onChange={() => toggleDebugFlag('heat')}
2049
+ />
2050
+ Load heat
2051
+ </label>
2052
+ <label>
2053
+ <input
2054
+ type="checkbox"
2055
+ checked={debugToggles.hitAreas}
2056
+ onChange={() => toggleDebugFlag('hitAreas')}
2057
+ />
2058
+ Link hit areas
2059
+ </label>
2060
+ <label>
2061
+ <input
2062
+ type="checkbox"
2063
+ checked={debugToggles.stats}
2064
+ onChange={() => toggleDebugFlag('stats')}
2065
+ />
2066
+ FPS / counts
2067
+ </label>
2068
+ </div>
2069
+
2070
+ {debugToggles.stats ? (
2071
+ <div className="debug-stats">
2072
+ <span>FPS {snapshotMode ? 'snapshot' : fps.toFixed(0)}</span>
2073
+ <span>Racks {model.objectCounts.pods}</span>
2074
+ <span>Nodes {model.objectCounts.nodes}</span>
2075
+ <span>GPUs {model.objectCounts.gpus}</span>
2076
+ <span>Detail {detailLevel}</span>
2077
+ <span>Zoom {viewport.scale.toFixed(2)}x</span>
2078
+ </div>
2079
+ ) : null}
2080
+ </div>
2081
+ )}
2082
+ </div>
2083
+ </div>
2084
+ </div>
2085
+ )
2086
+ }
src/components/ControlsPanel.tsx ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ applyGpuPreset,
3
+ applyExamplePhase,
4
+ applyExamplePreset,
5
+ getExamplePresetOptions,
6
+ getFactorOptions,
7
+ getGpuPresetId,
8
+ getGpuPresetOptions,
9
+ getPhaseOptions,
10
+ type WorkbenchConfig,
11
+ } from '../lib/workbench'
12
+ import { type WorkbenchViewModel } from '../lib/workbenchPresenter'
13
+
14
/** Props accepted by the top-level `ControlsPanel` component. */
type ControlsPanelProps = {
  /** Configuration currently shown by the controls. */
  config: WorkbenchConfig
  /** Receives the complete next configuration after any edit. */
  onChange: (next: WorkbenchConfig) => void
  /** Invoked when the "reset" chip is clicked. */
  onReset: () => void
  /** Presenter output; the panel reads summary values from its `analysis` field. */
  viewModel: WorkbenchViewModel
}
20
+
21
/** Props for `OptionStrip`, a card offering a fixed set of numeric choices. */
type OptionStripProps = {
  /** Card heading; also used as the accessible name of the chip group. */
  label: string
  /** Short description rendered under the heading. */
  caption: string
  /** Currently selected option. */
  value: number
  /** Choices rendered as chips, in display order. */
  options: number[]
  /** Called with the chip's value when a chip is clicked. */
  onSelect: (value: number) => void
}
28
+
29
/** Props for `SelectField`, a labelled native `<select>`. */
type SelectFieldProps = {
  /** Text shown next to the select. */
  label: string
  /** Value of the currently selected option. */
  value: string
  /** Entries rendered as `<option>` elements, in display order. */
  options: { value: string; label: string }[]
  /** Called with the newly selected option value. */
  onChange: (value: string) => void
}
35
+
36
/** Props for `NumberField`, a labelled numeric input with a lower bound. */
type NumberFieldProps = {
  /** Text shown next to the input. */
  label: string
  /** Number currently displayed. */
  value: number
  /** Lower bound; reported values are never smaller than this. */
  min: number
  /** Increment of the native input; `NumberField` defaults it to 1. */
  step?: number
  /** Called with the parsed, lower-bounded number on every input change. */
  onChange: (value: number) => void
}
43
+
44
/** Props for `ToggleField`, a labelled checkbox. */
type ToggleFieldProps = {
  /** Text rendered after the checkbox. */
  label: string
  /** Whether the checkbox is ticked. */
  checked: boolean
  /** Called with the new checked state when the checkbox changes. */
  onChange: (checked: boolean) => void
}
49
+
50
/**
 * Card that presents a numeric setting as a row of mutually exclusive chips.
 *
 * The selected value is echoed in the header badge (suffixed with "x") and its
 * chip receives the `active` class. Clicking a chip reports that chip's value
 * through `onSelect`; the component keeps no state of its own.
 */
function OptionStrip({ label, caption, value, options, onSelect }: OptionStripProps) {
  return (
    <div className="control-card">
      <div className="control-card-header">
        <div>
          <h3>{label}</h3>
          <p>{caption}</p>
        </div>
        <span className="control-badge">{value}x</span>
      </div>

      <div className="option-strip" role="group" aria-label={label}>
        {options.map((option) => {
          // The `active` class is purely visual, so the selection is also
          // exposed through `aria-pressed` for assistive technology.
          const isSelected = option === value

          return (
            <button
              key={option}
              type="button"
              className={isSelected ? 'option-chip active' : 'option-chip'}
              aria-pressed={isSelected}
              onClick={() => onSelect(option)}
            >
              {option}
            </button>
          )
        })}
      </div>
    </div>
  )
}
76
+
77
/**
 * Labelled native `<select>`.
 *
 * Renders one `<option>` per entry in `options` and forwards the chosen
 * option's value string to `onChange`.
 */
function SelectField({ label, value, options, onChange }: SelectFieldProps) {
  const optionElements = options.map(({ value: optionValue, label: optionLabel }) => (
    <option key={optionValue} value={optionValue}>
      {optionLabel}
    </option>
  ))

  return (
    <label className="control-field">
      <span>{label}</span>
      <select value={value} onChange={({ target }) => onChange(target.value)}>
        {optionElements}
      </select>
    </label>
  )
}
91
+
92
/**
 * Converts the raw text of a number input into a value no smaller than `min`.
 *
 * Empty or non-numeric text falls back to `min`. A typed `0` is treated as a
 * real number rather than as "missing", so it is only raised when `min` itself
 * is above zero.
 */
function parseNumberInput(raw: string, min: number): number {
  const parsed = raw.trim() === '' ? Number.NaN : Number(raw)
  return Number.isFinite(parsed) ? Math.max(min, parsed) : min
}

/**
 * Labelled numeric input with a lower bound.
 *
 * Every change is parsed and raised to at least `min` before being reported
 * through `onChange`, so the caller never receives `NaN` or a value below the
 * bound. `step` only configures the native input and defaults to 1.
 */
function NumberField({ label, value, min, step = 1, onChange }: NumberFieldProps) {
  return (
    <label className="control-field">
      <span>{label}</span>
      <input
        type="number"
        min={min}
        step={step}
        value={value}
        onChange={(event) => onChange(parseNumberInput(event.target.value, min))}
      />
    </label>
  )
}
106
+
107
/**
 * Labelled checkbox.
 *
 * Reports the checkbox's new checked state through `onChange`; the label text
 * is rendered after the box.
 */
function ToggleField({ label, checked, onChange }: ToggleFieldProps) {
  return (
    <label className="control-toggle">
      <input
        type="checkbox"
        checked={checked}
        onChange={({ target }) => onChange(target.checked)}
      />
      <span>{label}</span>
    </label>
  )
}
119
+
120
/**
 * Full control surface for the workbench: example presets, model architecture,
 * cluster hardware, training settings, and parallelism factors.
 *
 * The component is fully controlled. Every edit builds a complete next
 * `WorkbenchConfig` and hands it to `onChange`. Manual edits also set
 * `customized: true`; example, phase, and GPU preset selections are delegated
 * to the `apply*` helpers from `../lib/workbench` instead.
 */
export function ControlsPanel({ config, onChange, onReset, viewModel }: ControlsPanelProps) {
  const { cluster, model, parallelism, training } = config
  const { analysis } = viewModel

  // TP and EP options are derived from the per-node GPU count; PP and CP
  // options from the cluster-wide GPU count.
  const clusterGpuCount = cluster.gpusPerNode * cluster.numNodes
  const factorOptions = {
    tp: getFactorOptions(cluster.gpusPerNode, parallelism.tp),
    pp: getFactorOptions(clusterGpuCount, parallelism.pp),
    cp: getFactorOptions(clusterGpuCount, parallelism.cp),
    ep: getFactorOptions(cluster.gpusPerNode, parallelism.ep),
  }
  const gpuPresetId = getGpuPresetId(cluster.gpuType)

  const examplePresetChoices = getExamplePresetOptions().map(({ id, label }) => ({
    value: id,
    label,
  }))
  const phaseChoices = getPhaseOptions(config.examplePresetId).map(({ id, label }) => ({
    value: id,
    label,
  }))
  const gpuPresetChoices = [
    ...getGpuPresetOptions().map(({ id, label }) => ({ value: id, label })),
    { value: 'custom', label: 'custom GPU' },
  ]

  // Applies a manual edit: merges `patch` over the current config and flags
  // the result as customized.
  const commitCustomized = (patch: Partial<WorkbenchConfig>) => {
    onChange({ ...config, customized: true, ...patch })
  }

  const updateTraining = <K extends keyof WorkbenchConfig['training']>(
    key: K,
    value: WorkbenchConfig['training'][K],
  ) => {
    commitCustomized({ training: { ...training, [key]: value } })
  }

  const updateModel = <K extends keyof WorkbenchConfig['model']>(
    key: K,
    value: WorkbenchConfig['model'][K],
  ) => {
    commitCustomized({ model: { ...model, [key]: value } })
  }

  const updateCluster = <K extends keyof WorkbenchConfig['cluster']>(
    key: K,
    value: WorkbenchConfig['cluster'][K],
  ) => {
    commitCustomized({ cluster: { ...cluster, [key]: value } })
  }

  const updateParallelism = <K extends keyof WorkbenchConfig['parallelism']>(
    key: K,
    value: WorkbenchConfig['parallelism'][K],
  ) => {
    commitCustomized({ parallelism: { ...parallelism, [key]: value } })
  }

  const updateAttentionProfile = (
    nextAttentionProfile: WorkbenchConfig['model']['attentionProfile'],
  ) => {
    updateModel('attentionProfile', nextAttentionProfile)
  }

  const updateMoe = (nextMoe: NonNullable<WorkbenchConfig['model']['moe']> | undefined) => {
    updateModel('moe', nextMoe)
  }

  // Overwrites selected fields of the GPU spec while keeping the rest.
  const patchGpuType = (patch: Partial<WorkbenchConfig['cluster']['gpuType']>) => {
    updateCluster('gpuType', { ...cluster.gpuType, ...patch })
  }

  // Switching to MoE keeps existing MoE settings or seeds defaults sized from
  // the current model; switching to dense drops the MoE settings.
  const handleArchitectureChange = (value: string) => {
    const architecture = value as WorkbenchConfig['model']['architecture']

    commitCustomized({
      model: {
        ...model,
        architecture,
        moe:
          architecture === 'moe'
            ? (model.moe ?? {
                numExperts: 64,
                expertsPerToken: 2,
                numDenseLayers: Math.min(model.numLayers, 4),
                expertIntermediateSize: Math.max(1024, model.hiddenDim),
                activeParamsPerToken: undefined,
              })
            : undefined,
      },
    })
  }

  // Switching to hybrid carries over any hybrid fields already on the profile
  // and defaults the sliding window to 4096.
  const handleAttentionTypeChange = (value: string) => {
    if (value === 'full') {
      updateAttentionProfile({ type: 'full' })
      return
    }

    const current = model.attentionProfile
    updateAttentionProfile({
      type: 'hybrid',
      slidingWindowSize: current?.slidingWindowSize ?? 4096,
      globalAttentionFraction: current?.globalAttentionFraction,
      globalAttentionEveryN: current?.globalAttentionEveryN,
    })
  }

  // "custom" is a display-only entry, so selecting it changes nothing.
  const handleGpuPresetChange = (value: string) => {
    if (value === 'custom') {
      return
    }

    onChange(applyGpuPreset(config, value as Parameters<typeof applyGpuPreset>[1]))
  }

  return (
    <section className="controls-band">
      <div className="controls-head">
        <div>
          <p className="mini-label">Controls</p>
          <h2>Example / model / cluster / training / parallelism</h2>
        </div>

        <div className="controls-meta">
          <span>{analysis.totalGPUs.toLocaleString()} GPUs in cluster</span>
          <span>
            {parallelism.tp}×{parallelism.pp}×{parallelism.cp}×
            {parallelism.ep} · DP {analysis.derivedParallelism.dp}
          </span>
          <button type="button" className="reset-chip" onClick={onReset}>
            reset
          </button>
        </div>
      </div>

      <div className="controls-stack">
        <div className="control-card">
          <div className="control-card-header">
            <div>
              <h3>Examples</h3>
              <p>Built-in frontier runs as starting points for custom model and cluster edits</p>
            </div>
            {config.customized ? <span className="control-badge">customized</span> : null}
          </div>

          <div className="field-grid">
            <SelectField
              label="Example preset"
              value={config.examplePresetId}
              options={examplePresetChoices}
              onChange={(value) =>
                onChange(applyExamplePreset(config, value as typeof config.examplePresetId))
              }
            />
            <SelectField
              label="Phase"
              value={config.phaseId}
              options={phaseChoices}
              onChange={(value) => onChange(applyExamplePhase(config, value as typeof config.phaseId))}
            />
          </div>
        </div>

        <div className="control-card">
          <div className="control-card-header">
            <div>
              <h3>Model</h3>
              <p>Core architecture dimensions, attention layout, and optional MoE settings</p>
            </div>
          </div>

          <div className="field-grid field-grid-wide">
            <SelectField
              label="Architecture"
              value={model.architecture}
              options={[
                { value: 'dense', label: 'dense' },
                { value: 'moe', label: 'moe' },
              ]}
              onChange={handleArchitectureChange}
            />
            <NumberField
              label="Hidden dim"
              value={model.hiddenDim}
              min={128}
              step={128}
              onChange={(value) => updateModel('hiddenDim', value)}
            />
            <NumberField
              label="Layers"
              value={model.numLayers}
              min={1}
              onChange={(value) => updateModel('numLayers', value)}
            />
            <NumberField
              label="Attention heads"
              value={model.numHeads}
              min={1}
              onChange={(value) => updateModel('numHeads', value)}
            />
            <NumberField
              label="KV heads"
              value={model.numKVHeads}
              min={1}
              onChange={(value) => updateModel('numKVHeads', value)}
            />
            <NumberField
              label="Vocab size"
              value={model.vocabSize}
              min={256}
              onChange={(value) => updateModel('vocabSize', value)}
            />
            <NumberField
              label="Intermediate size"
              value={model.intermediateSize}
              min={256}
              onChange={(value) => updateModel('intermediateSize', value)}
            />
            <SelectField
              label="Attention profile"
              value={model.attentionProfile?.type ?? 'full'}
              options={[
                { value: 'full', label: 'full attention' },
                { value: 'hybrid', label: 'hybrid attention' },
              ]}
              onChange={handleAttentionTypeChange}
            />
            <div className="control-field control-field-toggle">
              <span>Tied embeddings</span>
              <ToggleField
                label="share output head"
                checked={model.tiedEmbeddings}
                onChange={(value) => updateModel('tiedEmbeddings', value)}
              />
            </div>

            {model.attentionProfile?.type === 'hybrid' ? (
              <>
                <NumberField
                  label="Sliding window"
                  value={model.attentionProfile.slidingWindowSize ?? 4096}
                  min={0}
                  step={256}
                  onChange={(value) =>
                    updateAttentionProfile({
                      ...model.attentionProfile,
                      type: 'hybrid',
                      slidingWindowSize: value > 0 ? value : undefined,
                    })
                  }
                />
                <NumberField
                  label="Global attention fraction"
                  value={model.attentionProfile.globalAttentionFraction ?? 0}
                  min={0}
                  step={0.05}
                  onChange={(value) =>
                    updateAttentionProfile({
                      ...model.attentionProfile,
                      type: 'hybrid',
                      globalAttentionFraction: value > 0 ? value : undefined,
                    })
                  }
                />
                <NumberField
                  label="Global every N"
                  value={model.attentionProfile.globalAttentionEveryN ?? 0}
                  min={0}
                  onChange={(value) =>
                    updateAttentionProfile({
                      ...model.attentionProfile,
                      type: 'hybrid',
                      globalAttentionEveryN: value > 0 ? value : undefined,
                    })
                  }
                />
              </>
            ) : null}

            {model.architecture === 'moe' && model.moe ? (
              <>
                <NumberField
                  label="Experts"
                  value={model.moe.numExperts}
                  min={1}
                  onChange={(value) => updateMoe({ ...model.moe!, numExperts: value })}
                />
                <NumberField
                  label="Experts per token"
                  value={model.moe.expertsPerToken}
                  min={1}
                  onChange={(value) => updateMoe({ ...model.moe!, expertsPerToken: value })}
                />
                <NumberField
                  label="Dense layers"
                  value={model.moe.numDenseLayers}
                  min={0}
                  onChange={(value) => updateMoe({ ...model.moe!, numDenseLayers: value })}
                />
                <NumberField
                  label="Expert intermediate"
                  value={model.moe.expertIntermediateSize}
                  min={256}
                  onChange={(value) =>
                    updateMoe({ ...model.moe!, expertIntermediateSize: value })
                  }
                />
                <NumberField
                  label="Active params / token"
                  value={model.moe.activeParamsPerToken ?? 0}
                  min={0}
                  step={1000000}
                  onChange={(value) =>
                    updateMoe({
                      ...model.moe!,
                      activeParamsPerToken: value > 0 ? value : undefined,
                    })
                  }
                />
              </>
            ) : null}
          </div>
        </div>

        <div className="control-card">
          <div className="control-card-header">
            <div>
              <h3>Cluster</h3>
              <p>GPU spec, node count, and rack-local topology for the physical fabric</p>
            </div>
          </div>

          <div className="field-grid field-grid-wide">
            <SelectField
              label="GPU preset"
              value={gpuPresetId}
              options={gpuPresetChoices}
              onChange={handleGpuPresetChange}
            />
            <label className="control-field">
              <span>GPU name</span>
              <input
                type="text"
                value={cluster.gpuType.name}
                onChange={(event) => patchGpuType({ name: event.target.value })}
              />
            </label>
            <NumberField
              label="HBM capacity (GB)"
              value={cluster.gpuType.hbmCapacityGB}
              min={1}
              onChange={(value) => patchGpuType({ hbmCapacityGB: value })}
            />
            <NumberField
              label="Peak BF16 TFLOPs"
              value={cluster.gpuType.peakTFLOPsBF16}
              min={1}
              onChange={(value) => patchGpuType({ peakTFLOPsBF16: value })}
            />
            <NumberField
              label="HBM bandwidth (TB/s)"
              value={cluster.gpuType.memBandwidthTBs}
              min={0.1}
              step={0.05}
              onChange={(value) => patchGpuType({ memBandwidthTBs: value })}
            />
            <NumberField
              label="GPUs per node"
              value={cluster.gpusPerNode}
              min={1}
              onChange={(value) => updateCluster('gpusPerNode', value)}
            />
            <NumberField
              label="Nodes"
              value={cluster.numNodes}
              min={1}
              onChange={(value) => updateCluster('numNodes', value)}
            />
            <NumberField
              label="Nodes per rack"
              value={cluster.nodesPerRack ?? cluster.numNodes}
              min={1}
              onChange={(value) => updateCluster('nodesPerRack', value)}
            />
            <NumberField
              label="Intra-node bandwidth (GB/s)"
              value={cluster.intraNodeBandwidthGBs}
              min={1}
              onChange={(value) => updateCluster('intraNodeBandwidthGBs', value)}
            />
            <NumberField
              label="Inter-node bandwidth (GB/s)"
              value={cluster.interNodeBandwidthGBs}
              min={1}
              onChange={(value) => updateCluster('interNodeBandwidthGBs', value)}
            />
          </div>
        </div>

        <div className="control-card">
          <div className="control-card-header">
            <div>
              <h3>Training</h3>
              <p>Batching, precision, optimizer state, and recompute</p>
            </div>
          </div>

          <div className="field-grid field-grid-wide">
            <NumberField
              label="Micro-batch"
              value={training.microBatchSize}
              min={1}
              onChange={(value) => updateTraining('microBatchSize', value)}
            />
            <NumberField
              label="Seq length"
              value={training.seqLength}
              min={256}
              step={256}
              onChange={(value) => updateTraining('seqLength', value)}
            />
            <NumberField
              label="Grad accum"
              value={training.gradAccumSteps}
              min={1}
              onChange={(value) => updateTraining('gradAccumSteps', value)}
            />
            <SelectField
              label="Precision"
              value={training.precision}
              options={[
                { value: 'bf16', label: 'bf16' },
                { value: 'fp16', label: 'fp16' },
                { value: 'fp8', label: 'fp8' },
                { value: 'fp32', label: 'fp32' },
              ]}
              onChange={(value) =>
                updateTraining('precision', value as WorkbenchConfig['training']['precision'])
              }
            />
            <SelectField
              label="Optimizer"
              value={training.optimizer}
              options={[
                { value: 'adamw', label: 'adamw' },
                { value: 'adam', label: 'adam' },
                { value: 'muon', label: 'muon' },
                { value: 'sgd', label: 'sgd' },
              ]}
              onChange={(value) =>
                updateTraining('optimizer', value as WorkbenchConfig['training']['optimizer'])
              }
            />
            <div className="control-field control-field-toggle">
              <span>Activation recompute</span>
              <ToggleField
                label="checkpointing enabled"
                checked={training.activationCheckpointing}
                onChange={(value) => updateTraining('activationCheckpointing', value)}
              />
            </div>
          </div>
        </div>

        <div className="controls-grid controls-grid-parallelism">
          <OptionStrip
            label="TP"
            caption="tensor shards"
            value={parallelism.tp}
            options={factorOptions.tp}
            onSelect={(value) => updateParallelism('tp', value)}
          />
          <OptionStrip
            label="PP"
            caption="pipeline stages"
            value={parallelism.pp}
            options={factorOptions.pp}
            onSelect={(value) => updateParallelism('pp', value)}
          />
          <OptionStrip
            label="CP"
            caption="context shards"
            value={parallelism.cp}
            options={factorOptions.cp}
            onSelect={(value) => updateParallelism('cp', value)}
          />
          <OptionStrip
            label="EP"
            caption="expert lanes"
            value={parallelism.ep}
            options={factorOptions.ep}
            onSelect={(value) => updateParallelism('ep', value)}
          />
          <OptionStrip
            label="ZeRO"
            caption="state sharding"
            value={parallelism.zeroStage}
            options={[0, 1, 2, 3]}
            onSelect={(value) => updateParallelism('zeroStage', value as 0 | 1 | 2 | 3)}
          />
          <div className="control-card">
            <div className="control-card-header">
              <div>
                <h3>Megatron-style derived DP</h3>
                <p>DP is derived from cluster size and the other parallel axes</p>
              </div>
              <span className="control-badge">{analysis.derivedParallelism.dp}x</span>
            </div>
            <div className="field-grid">
              <NumberField
                label="FSDP shard group"
                value={parallelism.fsdpShardGroupSize}
                min={0}
                step={8}
                onChange={(value) => updateParallelism('fsdpShardGroupSize', value)}
              />
              <div className="control-field control-field-toggle">
                <span>Distributed optimizer</span>
                <ToggleField
                  label="optimizer state sharding"
                  checked={parallelism.distributedOptimizer}
                  onChange={(value) => updateParallelism('distributedOptimizer', value)}
                />
              </div>
            </div>
          </div>
        </div>
      </div>
    </section>
  )
}
src/components/pixi/PixiSurface.tsx ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Application } from '@pixi/react'
2
+ import { useEffect, useRef, type ReactNode } from 'react'
3
+ import { useElementSize } from '../../hooks/useElementSize'
4
+
5
/** Props for `PixiSurface`, a self-measuring host for a Pixi application. */
type PixiSurfaceProps = {
  /** Class applied to the host `<div>` that is measured. */
  className?: string
  /** Class forwarded to the Pixi `<Application>` element. */
  canvasClassName?: string
  /** Emitted as `data-testid` on the host `<div>`. */
  testId?: string
  /** Notified with the measured size whenever it changes and both sides are non-zero. */
  onSizeChange?: (width: number, height: number) => void
  /** Render function that receives the measured size and returns the scene. */
  children: (size: { width: number; height: number }) => ReactNode
}
12
+
13
/**
 * Host element that measures itself and mounts a Pixi `<Application>` inside.
 *
 * The application is only rendered once the host has a non-zero measured
 * width and height. The scene comes from the `children` render function,
 * which is called with the current size. `onSizeChange`, when supplied, is
 * called from an effect with each new non-zero size.
 */
export function PixiSurface({
  className,
  canvasClassName,
  testId,
  onSizeChange,
  children,
}: PixiSurfaceProps) {
  const hostRef = useRef<HTMLDivElement>(null)
  const size = useElementSize(hostRef)
  const { width, height } = size

  useEffect(() => {
    // Nothing to report without a listener or before the host has an area.
    const isUnmeasured = width === 0 || height === 0
    if (!onSizeChange || isUnmeasured) {
      return
    }

    onSizeChange(width, height)
  }, [onSizeChange, height, width])

  const application =
    width > 0 && height > 0 ? (
      <Application
        className={canvasClassName}
        resizeTo={hostRef}
        preference="webgl"
        antialias
        autoDensity
        backgroundAlpha={0}
        clearBeforeRender
        sharedTicker={false}
        resolution={1}
      >
        {children(size)}
      </Application>
    ) : null

  return (
    <div ref={hostRef} className={className} data-testid={testId}>
      {application}
    </div>
  )
}
src/hooks/useElementSize.ts ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useState, type RefObject } from 'react'
2
+
3
/** Width and height reported by `useElementSize`, rounded to whole numbers. */
type ElementSize = {
  /** Rounded width of the observed element's content rect. */
  width: number
  /** Rounded height of the observed element's content rect. */
  height: number
}
7
+
8
/**
 * Tracks the size of the element held by `ref` with a `ResizeObserver`.
 *
 * Returns `{ width: 0, height: 0 }` until the observer delivers its first
 * entry. Measurements come from the entry's `contentRect` and are rounded;
 * the state object is only replaced when a rounded dimension actually
 * changes, so unchanged sizes do not trigger re-renders.
 *
 * @param ref Ref to the element to observe. If it is empty when the effect
 *   runs, nothing is observed.
 * @returns The most recent rounded size.
 */
export function useElementSize<T extends HTMLElement>(
  ref: RefObject<T | null>,
): ElementSize {
  const [size, setSize] = useState<ElementSize>({ width: 0, height: 0 })

  useEffect(() => {
    const target = ref.current
    if (!target) {
      return
    }

    const handleResize: ResizeObserverCallback = ([firstEntry]) => {
      if (!firstEntry) {
        return
      }

      const measured: ElementSize = {
        width: Math.round(firstEntry.contentRect.width),
        height: Math.round(firstEntry.contentRect.height),
      }

      // Keep the previous object when nothing changed so React skips the update.
      setSize((previous) =>
        previous.width === measured.width && previous.height === measured.height
          ? previous
          : measured,
      )
    }

    const observer = new ResizeObserver(handleResize)
    observer.observe(target)

    return () => {
      observer.disconnect()
    }
  }, [ref])

  return size
}
src/index.css ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Design tokens and base typography inherited by the whole document. */
:root {
  /* Typeface stacks */
  --font-sans: 'Space Grotesk', sans-serif;
  --font-mono: 'IBM Plex Mono', monospace;

  /* Text colours, strongest to faintest */
  --ink-strong: #16262d;
  --ink-soft: #526771;
  --ink-muted: #7d8f98;

  /* Accent colours and panel outline */
  --accent-cool: #117a70;
  --accent-warm: #d66225;
  --panel-stroke: rgba(24, 64, 86, 0.08);

  color: var(--ink-strong);
  font-family: var(--font-sans);
  font-synthesis: none;
  font-weight: 400;
  line-height: 1.5;
  text-rendering: optimizeLegibility;
  -moz-osx-font-smoothing: grayscale;
  -webkit-font-smoothing: antialiased;
}

* {
  box-sizing: border-box;
}

/* The layout never shrinks below 320px wide. */
html,
body {
  min-width: 320px;
}

html {
  min-height: 100%;
}

/* Page body and the React mount point both fill at least the viewport. */
body,
#root {
  min-height: 100vh;
}

body {
  margin: 0;
  /* Two corner glows (cool left, warm right) over a vertical paper gradient. */
  background:
    radial-gradient(circle at 0% 0%, rgba(17, 122, 112, 0.08), transparent 24%),
    radial-gradient(circle at 100% 0%, rgba(214, 98, 37, 0.1), transparent 20%),
    linear-gradient(180deg, #f3f0e8 0%, #efebe0 48%, #ebe6d9 100%);
}

/* Form controls inherit the page font. */
button,
input,
select,
textarea {
  font: inherit;
}

button {
  cursor: pointer;
}

a {
  color: inherit;
}
src/lib/linkedFocus.ts ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * A highlight request used to pick out matching GPUs.
 *
 * For `stage`, `tpLane`, and `cpShard`, `null` means "do not filter on this
 * field" in `matchesLinkedFocus`.
 */
export type LinkedFocus = {
  /** Origin of the focus; only `'transformer'` exists. */
  source: 'transformer'
  /** Text label for the focus. */
  label: string
  /** Stage to match, or `null` for any. */
  stage: number | null
  /** TP lane to match, or `null` for any. */
  tpLane: number | null
  /** CP shard to match, or `null` for any. */
  cpShard: number | null
  /** Sequence band of the focus; not consulted by `matchesLinkedFocus`. */
  sequenceBand: number | null
}
9
+
10
/** The subset of GPU fields that `matchesLinkedFocus` inspects. */
type FocusableGpu = {
  /** Inactive GPUs never match a focus. */
  active: boolean
  /** Stage index compared against `LinkedFocus.stage`. */
  stage: number
  /** TP lane compared against `LinkedFocus.tpLane`. */
  tpLane: number
  /** CP shard compared against `LinkedFocus.cpShard`. */
  cpShard: number
}
16
+
17
/**
 * Reports whether `gpu` is selected by `focus`.
 *
 * A GPU matches only when a focus exists, the GPU is active, and every
 * non-null focus field (`stage`, `tpLane`, `cpShard`) equals the GPU's
 * corresponding value. `sequenceBand` is not part of the comparison.
 *
 * @returns `true` when the GPU should be highlighted, otherwise `false`.
 */
export function matchesLinkedFocus(gpu: FocusableGpu, focus: LinkedFocus | null) {
  if (!focus || !gpu.active) {
    return false
  }

  // Each pair is [value requested by the focus, value on the GPU].
  const constraints: Array<[number | null, number]> = [
    [focus.stage, gpu.stage],
    [focus.tpLane, gpu.tpLane],
    [focus.cpShard, gpu.cpShard],
  ]

  return constraints.every(([wanted, actual]) => wanted === null || wanted === actual)
}
src/lib/topologyLod.ts ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/** Names of the level-of-detail bands, in the order used by `BAND_ORDER`. */
export type TopologyLodBand = 'overview' | 'board' | 'package' | 'silicon' | 'micro'
7
+
8
/** One numeric weight per level-of-detail band. */
export type TopologyLodWeights = { [Band in TopologyLodBand]: number }
9
+
10
/** Scale thresholds at which a single band fades in and back out. */
type TopologyLodBandRange = {
  fadeInStart: number
  fadeInEnd: number
  fadeOutStart: number
  fadeOutEnd: number
}

/** Zoom limits plus the fade thresholds of every level-of-detail band. */
export type TopologyLodPolicy = {
  /** Lower scale limit of the policy. */
  minScale: number
  /** Upper scale limit of the policy. */
  maxScale: number
  /** Fade thresholds keyed by band name. */
  bands: Record<TopologyLodBand, TopologyLodBandRange>
}
23
+
24
/** Level-of-detail values derived from a viewport scale by `getTopologyLodState`. */
export type TopologyLodState = {
  /** The scale the state was computed for. */
  scale: number
  /** The band chosen as dominant at this scale. */
  primaryBand: TopologyLodBand
  /** Per-band blend weights. */
  weights: TopologyLodWeights
  /** Ramp from 0 to 1 as the scale moves from 22 to 80. */
  deepIsolation: number
  /** Text size multiplier, easing from 1 down to 0.6 between scales 14 and 36. */
  textScale: number
}
31
+
32
/** Limits `value` to the range [`min`, `max`]; the upper bound is applied last. */
const clamp = (value: number, min: number, max: number) => {
  const raisedToFloor = Math.max(value, min)
  return Math.min(raisedToFloor, max)
}
34
+
35
/**
 * Level-of-detail policy for the topology view.
 *
 * Each band lists the scale at which it starts and finishes fading in, then
 * starts and finishes fading out. `micro` has equal fade-out thresholds, which
 * `bandWeight` treats as "never fades out".
 */
export const TOPOLOGY_LOD_POLICY: TopologyLodPolicy = {
  minScale: 0.035,
  maxScale: 420,
  bands: {
    overview: { fadeInStart: 0.035, fadeInEnd: 0.06, fadeOutStart: 1.4, fadeOutEnd: 3.8 },
    board: { fadeInStart: 0.12, fadeInEnd: 0.55, fadeOutStart: 7, fadeOutEnd: 18 },
    package: { fadeInStart: 1.8, fadeInEnd: 4.8, fadeOutStart: 28, fadeOutEnd: 70 },
    silicon: { fadeInStart: 10, fadeInEnd: 24, fadeOutStart: 95, fadeOutEnd: 220 },
    micro: { fadeInStart: 36, fadeInEnd: 95, fadeOutStart: 420, fadeOutEnd: 420 },
  },
}
71
+
72
/** Clamps `value` to the unit interval [0, 1]. */
export const saturate = (value: number) => {
  return clamp(value, 0, 1)
}
73
+
74
/**
 * Cubic ease `t * t * (3 - 2 * t)` evaluated after clamping the input to
 * [0, 1], so inputs at or below 0 give 0 and inputs at or above 1 give 1.
 */
export const smoothstep = (value: number) => {
  const t = saturate(value)
  return t * t * (3 - 2 * t)
}
78
+
79
/**
 * Eased 0..1 progress of `scale` through the interval from `start` to `end`.
 *
 * When the interval is empty (`start === end`) the result is a hard step:
 * 1 once `scale` has reached `end`, otherwise 0.
 */
export const fadeBetween = (scale: number, start: number, end: number) =>
  start === end ? (scale >= end ? 1 : 0) : smoothstep((scale - start) / (end - start))
86
+
87
/**
 * Weight of a single band at `scale`: its fade-in progress multiplied by the
 * remainder of its fade-out, clamped to [0, 1].
 *
 * A band whose fade-out thresholds are equal is treated as never fading out.
 */
export const bandWeight = (
  scale: number,
  band: TopologyLodPolicy['bands'][TopologyLodBand],
) => {
  const rampUp = fadeBetween(scale, band.fadeInStart, band.fadeInEnd)

  let rampDown = 1
  if (band.fadeOutStart !== band.fadeOutEnd) {
    rampDown = 1 - fadeBetween(scale, band.fadeOutStart, band.fadeOutEnd)
  }

  return saturate(rampUp * rampDown)
}
102
+
103
/** Bands in the order they are searched when picking the primary band. */
const BAND_ORDER: TopologyLodBand[] = ['overview', 'board', 'package', 'silicon', 'micro']
110
+
111
/**
 * Computes band weights in which at most two adjacent bands are non-zero and
 * the weights always sum to 1.
 *
 * Neighbouring bands in `BAND_ORDER` are cross-faded over the later band's
 * fade-in interval from `TOPOLOGY_LOD_POLICY`:
 * - below the interval, the earlier band has weight 1;
 * - inside it (inclusive of both ends), the two bands share the weight
 *   according to `fadeBetween`;
 * - above it, the next pair of bands is considered.
 *
 * A scale beyond the last interval gives `micro` weight 1. So does a scale
 * for which every comparison is false, such as `NaN`.
 */
const createExclusiveWeights = (scale: number): TopologyLodWeights => {
  const weights: TopologyLodWeights = {
    overview: 0,
    board: 0,
    package: 0,
    silicon: 0,
    micro: 0,
  }

  for (let index = 1; index < BAND_ORDER.length; index += 1) {
    const from = BAND_ORDER[index - 1]
    const to = BAND_ORDER[index]
    const { fadeInStart, fadeInEnd } = TOPOLOGY_LOD_POLICY.bands[to]

    // Not yet at this transition: the earlier band owns the whole weight.
    if (scale < fadeInStart) {
      weights[from] = 1
      return weights
    }

    // Inside the transition: cross-fade from the earlier band to the later one.
    if (scale <= fadeInEnd) {
      const alpha = fadeBetween(scale, fadeInStart, fadeInEnd)
      weights[from] = 1 - alpha
      weights[to] = alpha
      return weights
    }
  }

  weights.micro = 1
  return weights
}
183
+
184
/**
 * Derives the complete level-of-detail state for a viewport scale.
 *
 * The primary band is the first band in `BAND_ORDER` whose weight is at least
 * 0.5. If none qualifies, the heaviest band is used, and `'overview'` is the
 * final fallback. `deepIsolation` ramps from 0 to 1 between scales 22 and 80;
 * `textScale` eases from 1 down to 0.6 between scales 14 and 36.
 */
export const getTopologyLodState = (scale: number): TopologyLodState => {
  const weights = createExclusiveWeights(scale)

  let primaryBand: TopologyLodBand | undefined = BAND_ORDER.find(
    (band) => weights[band] >= 0.5,
  )
  if (primaryBand === undefined) {
    const ranked = Object.entries(weights).sort((left, right) => right[1] - left[1])
    primaryBand = (ranked[0]?.[0] as TopologyLodBand | undefined) ?? 'overview'
  }

  const deepIsolation = fadeBetween(scale, 22, 80)
  const textScale = 1 - fadeBetween(scale, 14, 36) * 0.4

  return {
    scale,
    primaryBand,
    weights,
    deepIsolation,
    textScale,
  }
}
201
+
202
/**
 * Divides a length in `pixels` by the viewport `scale` and clamps the result
 * to [`minWorld`, `maxWorld`].
 *
 * The scale is floored at 0.001 before dividing to avoid dividing by zero or
 * a negative scale.
 */
export const screenWorld = (
  scale: number,
  pixels: number,
  minWorld = 0.04,
  maxWorld = 3.5,
) => {
  const safeScale = Math.max(scale, 0.001)
  return clamp(pixels / safeScale, minWorld, maxWorld)
}
208
+
209
/**
 * Same conversion as `screenWorld`, but with a default upper bound of 2.8
 * instead of 3.5.
 */
export const screenStroke = (
  scale: number,
  pixels: number,
  minWorld = 0.04,
  maxWorld = 2.8,
) => {
  return screenWorld(scale, pixels, minWorld, maxWorld)
}
215
+
216
/**
 * Linear interpolation: returns `from` at `alpha` 0 and `to` at `alpha` 1.
 * `alpha` is not clamped, so values outside [0, 1] extrapolate.
 */
export const mix = (from: number, to: number, alpha: number) => {
  const span = to - from
  return from + span * alpha
}
src/lib/topologyScene.ts ADDED
@@ -0,0 +1,980 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { type WorkbenchViewModel } from './workbenchPresenter'
2
+ import { TOPOLOGY_LOD_POLICY, type TopologyLodPolicy } from './topologyLod'
3
+
4
/**
 * Pan/zoom transform of the scene. worldToScreen maps a world coordinate to
 * the screen as `world * scale + offset`, where the offset is (x, y).
 */
export type ViewportState = {
  // Horizontal offset added after scaling.
  x: number
  // Vertical offset added after scaling.
  y: number
  // Zoom factor applied to world coordinates and sizes.
  scale: number
}
9
+
10
/** Kinds of scene objects that findHoverTarget can report. */
export type HitTargetKind = 'pod' | 'node' | 'gpu' | 'link'
/**
 * Geometric class of a link. buildTopologySceneModel uses 'bus' for traffic
 * that stays inside one node, 'row' when the endpoints are at least as far
 * apart horizontally as vertically, and 'column' otherwise.
 */
export type LinkKind = 'row' | 'column' | 'bus'
12
+
13
/**
 * Axis-aligned rectangle used for hit-testing. pointInBounds treats all four
 * edges as inclusive.
 */
export type SceneHitBounds = {
  // Left edge.
  x: number
  // Top edge.
  y: number
  width: number
  height: number
}
19
+
20
/**
 * Axis-aligned rectangle attached to a scene object as its `focusFrame` or
 * `lodFrame`. Same shape as SceneHitBounds.
 */
export type SceneAnchorFrame = {
  // Left edge.
  x: number
  // Top edge.
  y: number
  width: number
  height: number
}
26
+
27
/**
 * Rack-sized container in the scene. buildTopologySceneModel creates one pod
 * per rack and derives the aggregate fields from the nodes placed in it.
 */
export type ScenePod = {
  // `pod-<index>`.
  id: string
  kind: 'pod'
  // Zero-based rack index.
  index: number
  // Top-left corner and size of the pod rectangle.
  x: number
  y: number
  width: number
  height: number
  // Centre of the pod rectangle; used as the endpoint of rack-scope links.
  centerX: number
  centerY: number
  // True only for the pod with the most active GPUs (first one on ties).
  active: boolean
  // Mean `interNodeLoad` of the pod's nodes.
  load: number
  // Mean `localFabricLoad` of the pod's nodes.
  thermal: number
  // Number of GPUs in the pod whose `active` flag is set.
  activeGpus: number
  // Number of GPUs placed in the pod.
  totalGpus: number
  // Rack label followed by the one-based rack number.
  title: string
  // Full pod rectangle.
  hitBounds: SceneHitBounds
  // Pod rectangle inset by 5 on every side.
  focusFrame: SceneAnchorFrame
}
46
+
47
/**
 * Rack view of a pod. buildTopologySceneModel derives one rack from each pod,
 * copying its geometry and load and listing the nodes it contains.
 */
export type SceneRack = {
  // `rack-<index>`.
  id: string
  // Zero-based rack index (same as the source pod's index).
  index: number
  // Geometry copied from the source pod.
  x: number
  y: number
  width: number
  height: number
  // Copied from the source pod's `load`.
  load: number
  // Ids of the nodes whose `domainIndex` equals this rack's index.
  nodeIds: string[]
  // Shared with the source pod.
  hitBounds: SceneHitBounds
  // Shared with the source pod.
  focusFrame: SceneAnchorFrame
}
59
+
60
/**
 * One GPU tile in the scene. buildTopologySceneModel creates it from an entry
 * of `analysis.gpuMap` plus the grid position of the owning node.
 */
export type SceneGpu = {
  // `gpu-<globalIndex>`.
  id: string
  kind: 'gpu'
  // Id and index of the owning node.
  nodeId: string
  nodeIndex: number
  // Rack index of the owning node and the node's position within that rack.
  domainIndex: number
  domainLocalIndex: number
  // Grid cell of the owning node inside its rack.
  columnIndex: number
  rowIndex: number
  // Cluster-wide GPU index and index within the node.
  globalIndex: number
  localIndex: number
  // Top-left corner and size of the GPU tile.
  x: number
  y: number
  width: number
  height: number
  // Copied from the analysis entry's `isActive`.
  active: boolean
  // Parallelism coordinates copied from the analysis entry; describeTarget
  // labels them Stage, Tensor lane, Context shard, Expert lane, Data replica,
  // Replica group and FSDP rank respectively.
  stage: number
  tpLane: number
  cpShard: number
  epLane: number
  dpReplica: number
  replicaGroup: number
  fsdpRank: number
  // memoryUsedGB / memoryCapacityGB, clamped to [0, 1].
  utilization: number
  // Blend of the owning node's local and inter-node loads, clamped to [0, 1].
  linkLoad: number
  memoryUsedGB: number
  memoryCapacityGB: number
  // Stage palette colour when memory is in use, otherwise a fixed idle colour.
  fillColor: number
  // Outline opacity; higher for GPUs with more of their memory in use.
  outlineAlpha: number
  // `GPU <globalIndex + 1>`.
  title: string
  // Tile rectangle grown by 3 on every side.
  hitBounds: SceneHitBounds
  // Tile rectangle grown by 0.8 on every side.
  focusFrame: SceneAnchorFrame
  // Exactly the tile rectangle.
  lodFrame: SceneAnchorFrame
}
94
+
95
/**
 * One node (a group of GPUs) in the scene, positioned on the grid of its rack
 * by buildTopologySceneModel.
 */
export type SceneNode = {
  // `node-<index>`.
  id: string
  kind: 'node'
  // Cluster-wide node index.
  index: number
  // Rack index and the node's position within that rack.
  domainIndex: number
  domainLocalIndex: number
  // Grid cell of the node inside its rack.
  columnIndex: number
  rowIndex: number
  // Top-left corner and size of the node rectangle.
  x: number
  y: number
  width: number
  height: number
  // Centre of the node rectangle; endpoint for node-to-node links.
  hubX: number
  hubY: number
  // Horizontal extent of the in-node bus line (node rectangle inset by 8).
  busX1: number
  busX2: number
  // Number of GPUs in the node with memoryUsedGB > 0.
  activeCount: number
  // Mean load of the node's bus links and NVLink node-to-node links.
  localFabricLoad: number
  // Mean load of the node's non-NVLink node-to-node links and of the
  // rack-scope links touching the node's rack.
  interNodeLoad: number
  // GPUs of this node, ordered by local index.
  gpus: SceneGpu[]
  // Full node rectangle.
  hitBounds: SceneHitBounds
  // Node rectangle inset by 2 on every side.
  focusFrame: SceneAnchorFrame
}
118
+
119
/**
 * One drawable link segment. buildTopologySceneModel aggregates the raw
 * analysis links per endpoint pair, traffic type and transport, and emits one
 * SceneLink per aggregate.
 */
export type SceneLink = {
  // `bus-…`, `link-…` or `rack-link-…` depending on what the link connects.
  id: string
  kind: LinkKind
  // 'rack' for links between two racks; 'node' for links within one rack
  // (including in-node bus links).
  scope: 'node' | 'rack'
  // Segment endpoints.
  x1: number
  y1: number
  x2: number
  y2: number
  // Mean utilization of the aggregated raw links, as a fraction (percent / 100).
  load: number
  // Colour looked up from the traffic type.
  color: number
  // Stroke width; grows with `load`.
  width: number
  // Full width of the hover corridor: findHoverTarget accepts points within
  // hitWidth / 2 of the segment.
  hitWidth: number
  // Human-readable description used as the hover heading.
  title: string
  trafficType: 'tp' | 'pp' | 'cp' | 'fsdp' | 'ep' | 'dp'
  transport: 'nvlink' | 'infiniband'
  // Mean volume of the aggregated raw links, rounded to two decimals.
  volumeGB: number
}
136
+
137
/** Result of a hover hit-test: which kind of object was hit and its id. */
export type HoverTarget = {
  kind: HitTargetKind
  // Id of the hit pod, node, GPU or link.
  id: string
}
141
+
142
/** Display-ready description of a hover target, as built by describeTarget. */
export type TargetDetails = {
  kind: HitTargetKind
  id: string
  // Primary title line.
  heading: string
  // Secondary line giving location or context.
  subheading: string
  // Label/value rows, already formatted as strings.
  metrics: Array<{ label: string; value: string }>
}
149
+
150
/**
 * Complete scene description returned by buildTopologySceneModel: overall
 * size, grid metrics, every pod/rack/node (GPUs live on their node) and the
 * aggregated links.
 */
export type TopologySceneModel = {
  // Overall scene size including the outer cluster padding.
  width: number
  height: number
  // Number of columns and rows in the rack grid.
  podColumns: number
  podRows: number
  // Size shared by every pod rectangle.
  podWidth: number
  podHeight: number
  // Size shared by every node rectangle.
  nodeWidth: number
  nodeHeight: number
  // Id and hit bounds of the pod with the most active GPUs; falls back to
  // 'pod-0' and the whole scene when there are no pods.
  activePodId: string
  activePodBounds: SceneHitBounds
  // Set to the cluster's node count.
  contextualNodeCount: number
  // The shared level-of-detail policy constant.
  lodPolicy: TopologyLodPolicy
  pods: ScenePod[]
  racks: SceneRack[]
  // Sorted by node index.
  nodes: SceneNode[]
  // Node- and rack-scope links classified as horizontal.
  rowLinks: SceneLink[]
  // Node- and rack-scope links classified as vertical.
  columnLinks: SceneLink[]
  // In-node bus links.
  busLinks: SceneLink[]
  // Summary counts of the objects above.
  objectCounts: {
    pods: number
    nodes: number
    gpus: number
    // Row, column and bus links combined.
    links: number
    // GPUs flagged active in the analysis GPU map.
    activeGpus: number
    // Set to the cluster's node count.
    contextualNodes: number
  }
}
178
+
179
/** Restricts `value` to the closed range [min, max]. */
const clamp = (value: number, min: number, max: number) => {
  const raised = Math.max(value, min)
  return Math.min(raised, max)
}
181
+
182
/** True when (x, y) lies inside `bounds`; all four edges count as inside. */
const pointInBounds = (bounds: SceneHitBounds, x: number, y: number) => {
  const right = bounds.x + bounds.width
  const bottom = bounds.y + bounds.height
  const withinColumns = x >= bounds.x && x <= right
  const withinRows = y >= bounds.y && y <= bottom

  return withinColumns && withinRows
}
187
+
188
/**
 * Shortest distance from the point (x, y) to the segment (x1, y1)-(x2, y2).
 * A zero-length segment is treated as the single point (x1, y1).
 */
const distanceToSegment = (
  x: number,
  y: number,
  x1: number,
  y1: number,
  x2: number,
  y2: number,
) => {
  const offsetX = x - x1
  const offsetY = y - y1
  const segmentX = x2 - x1
  const segmentY = y2 - y1
  const lengthSquared = segmentX * segmentX + segmentY * segmentY

  // Position of the closest point along the segment, 0 at start and 1 at end.
  let t = 0
  if (lengthSquared !== 0) {
    t = clamp((offsetX * segmentX + offsetY * segmentY) / lengthSquared, 0, 1)
  }

  const gapX = x - (x1 + t * segmentX)
  const gapY = y - (y1 + t * segmentY)

  return Math.sqrt(gapX * gapX + gapY * gapY)
}
210
+
211
/** Formats a fraction as a whole-number percentage string, e.g. 0.256 -> "26%". */
const percent = (value: number) => {
  const wholePercent = Math.round(value * 100)
  return `${wholePercent}%`
}
212
+
213
// Colours (0xRRGGBB) cycled through by getStageColor, one per pipeline stage.
const stagePalette = [0x6be5d2, 0xf2b36a, 0x8fbcff, 0xf28ac6, 0xb9e769, 0xc19cff]

// Link colour (0xRRGGBB) for each traffic type.
const trafficColorMap: Record<SceneLink['trafficType'], number> = {
  tp: 0x7ce9da,
  pp: 0xf1b067,
  cp: 0x77a8f1,
  fsdp: 0xb0a0ff,
  ep: 0xb9e769,
  dp: 0xf18888,
}
223
+
224
/** Picks the palette colour for a stage index, wrapping around the palette. */
function getStageColor(stageIndex: number) {
  const slot = stageIndex % stagePalette.length
  return stagePalette[slot]
}
227
+
228
/**
 * Derives the grid dimensions used to lay out racks, nodes and GPUs.
 *
 * - Racks are arranged on a near-square grid.
 * - Nodes inside a rack use a grid that is roughly twice as wide as it is tall.
 * - GPUs inside a node use 2 columns for up to 4 GPUs and 4 columns otherwise.
 *
 * @param cluster Cluster configuration; reads `numNodes`, `nodesPerRack`
 *   (defaults to all nodes in one rack) and `gpusPerNode`.
 * @returns Node-per-rack count plus column/row counts for each grid level.
 */
function createNodeLayout(cluster: WorkbenchViewModel['config']['cluster']) {
  const nodeCount = cluster.numNodes
  // Never let the divisor reach 0: an empty cluster (or an explicit 0) would
  // otherwise yield a NaN/Infinity rack count that poisons every derived size.
  const nodesPerRack = Math.max(1, cluster.nodesPerRack ?? nodeCount)
  const rackCount = Math.ceil(nodeCount / nodesPerRack)
  const rackColumns = Math.max(1, Math.ceil(Math.sqrt(rackCount)))
  const rackRows = Math.max(1, Math.ceil(rackCount / rackColumns))
  const nodeColumns = Math.max(1, Math.ceil(Math.sqrt(nodesPerRack * 2)))
  const nodeRows = Math.max(1, Math.ceil(nodesPerRack / nodeColumns))
  const gpuColumns = cluster.gpusPerNode <= 4 ? 2 : 4
  const gpuRows = Math.ceil(cluster.gpusPerNode / gpuColumns)

  return {
    nodesPerRack,
    rackCount,
    rackColumns,
    rackRows,
    nodeColumns,
    nodeRows,
    gpuColumns,
    gpuRows,
  }
}
250
+
251
/**
 * Builds the static scene description for the topology view: pods/racks on a
 * grid, nodes on a grid inside each rack, GPUs on a grid inside each node, and
 * one drawable link per aggregated group of raw analysis links.
 *
 * Raw links are aggregated by (endpoint pair, traffic type, transport).
 * Links whose endpoints sit in different racks become rack-scope links drawn
 * between pod centres; links inside one rack become node-scope links between
 * node hubs, or horizontal "bus" links when both endpoints share a node.
 * Per-node loads are the mean of the loads of the links touching the node,
 * and pod loads are the mean of their nodes' loads.
 *
 * @param viewModel Workbench view model; reads `config.cluster`,
 *   `analysis.gpuMap` and `analysis.links`.
 * @returns The complete scene model, including summary object counts.
 */
export function buildTopologySceneModel(viewModel: WorkbenchViewModel): TopologySceneModel {
  const { config, analysis } = viewModel
  const cluster = config.cluster
  const rackLabel = cluster.rackLabel ?? 'rack'
  const nodeLabel = cluster.nodeLabel ?? 'node'

  // Fixed layout metrics.
  const clusterPaddingX = 72
  const clusterPaddingY = 72
  const rackGapX = 56
  const rackGapY = 58
  const rackPaddingX = 20
  const rackPaddingY = 24
  const nodeGapX = 12
  const nodeGapY = 14
  const gpuWidth = 14
  const gpuHeight = 10
  const gpuGapX = 6
  const gpuGapY = 6

  const layout = createNodeLayout(cluster)
  const gpuGridWidth =
    layout.gpuColumns * gpuWidth + Math.max(layout.gpuColumns - 1, 0) * gpuGapX
  const gpuGridHeight =
    layout.gpuRows * gpuHeight + Math.max(layout.gpuRows - 1, 0) * gpuGapY
  const nodeWidth = 28 + gpuGridWidth
  const nodeHeight = 26 + gpuGridHeight
  const rackWidth =
    rackPaddingX * 2 +
    layout.nodeColumns * nodeWidth +
    Math.max(layout.nodeColumns - 1, 0) * nodeGapX
  const rackHeight =
    rackPaddingY * 2 +
    layout.nodeRows * nodeHeight +
    Math.max(layout.nodeRows - 1, 0) * nodeGapY
  const width =
    clusterPaddingX * 2 +
    layout.rackColumns * rackWidth +
    Math.max(layout.rackColumns - 1, 0) * rackGapX
  const height =
    clusterPaddingY * 2 +
    layout.rackRows * rackHeight +
    Math.max(layout.rackRows - 1, 0) * rackGapY

  // The GPU grid is centred inside the node; the insets are the same for
  // every node, so compute them once.
  const gpuLeftInset = (nodeWidth - gpuGridWidth) / 2
  const gpuTopInset = (nodeHeight - gpuGridHeight) / 2

  // One pod per rack; the aggregate fields are filled in further down.
  const pods: ScenePod[] = Array.from({ length: layout.rackCount }, (_, rackIndex) => {
    const column = rackIndex % layout.rackColumns
    const row = Math.floor(rackIndex / layout.rackColumns)
    const x = clusterPaddingX + column * (rackWidth + rackGapX)
    const y = clusterPaddingY + row * (rackHeight + rackGapY)

    return {
      id: `pod-${rackIndex}`,
      kind: 'pod',
      index: rackIndex,
      x,
      y,
      width: rackWidth,
      height: rackHeight,
      centerX: x + rackWidth / 2,
      centerY: y + rackHeight / 2,
      active: false,
      load: 0,
      thermal: 0,
      activeGpus: 0,
      totalGpus: 0,
      title: `${rackLabel} ${rackIndex + 1}`,
      hitBounds: {
        x,
        y,
        width: rackWidth,
        height: rackHeight,
      },
      focusFrame: {
        x: x + 5,
        y: y + 5,
        width: rackWidth - 10,
        height: rackHeight - 10,
      },
    }
  })

  const rawGpuMap = [...analysis.gpuMap].sort(
    (left, right) => left.globalGPUIndex - right.globalGPUIndex,
  )
  const rawGpuByGlobalIndex = new Map(
    rawGpuMap.map((gpu) => [gpu.globalGPUIndex, gpu] as const),
  )

  // Bucket GPUs by node in a single pass instead of filtering the whole GPU
  // list once per node.
  const rawGpusByNode = new Map<number, typeof rawGpuMap>()
  for (const gpu of rawGpuMap) {
    const bucket = rawGpusByNode.get(gpu.nodeIndex)
    if (bucket) {
      bucket.push(gpu)
    } else {
      rawGpusByNode.set(gpu.nodeIndex, [gpu])
    }
  }

  const nodeByIndex = new Map<number, SceneNode>()
  // Nodes grouped by rack, in node-index order, for the per-rack roll-ups.
  const nodesByRack = new Map<number, SceneNode[]>()

  for (let nodeIndex = 0; nodeIndex < cluster.numNodes; nodeIndex += 1) {
    const domainIndex = Math.floor(nodeIndex / layout.nodesPerRack)
    const domainLocalIndex = nodeIndex % layout.nodesPerRack
    const columnIndex = domainLocalIndex % layout.nodeColumns
    const rowIndex = Math.floor(domainLocalIndex / layout.nodeColumns)
    const rack = pods[domainIndex]
    const x = rack.x + rackPaddingX + columnIndex * (nodeWidth + nodeGapX)
    const y = rack.y + rackPaddingY + rowIndex * (nodeHeight + nodeGapY)
    const nodeGpuMap = (rawGpusByNode.get(nodeIndex) ?? []).sort(
      (left, right) => left.localGPUIndex - right.localGPUIndex,
    )

    const gpus: SceneGpu[] = nodeGpuMap.map((gpu) => {
      const localColumn = gpu.localGPUIndex % layout.gpuColumns
      const localRow = Math.floor(gpu.localGPUIndex / layout.gpuColumns)
      const gpuX = x + gpuLeftInset + localColumn * (gpuWidth + gpuGapX)
      const gpuY = y + gpuTopInset + localRow * (gpuHeight + gpuGapY)
      const memoryUtilization =
        gpu.memoryCapacityGB > 0 ? gpu.memoryUsedGB / gpu.memoryCapacityGB : 0
      // GPUs holding no memory are drawn in a fixed idle colour with a faint outline.
      const fillColor = gpu.memoryUsedGB > 0 ? getStageColor(Math.max(gpu.ppStage, 0)) : 0x26404d
      const outlineAlpha = gpu.memoryUsedGB > 0 ? 0.3 + memoryUtilization * 0.4 : 0.14

      return {
        id: `gpu-${gpu.globalGPUIndex}`,
        kind: 'gpu',
        nodeId: `node-${nodeIndex}`,
        nodeIndex,
        domainIndex,
        domainLocalIndex,
        columnIndex,
        rowIndex,
        globalIndex: gpu.globalGPUIndex,
        localIndex: gpu.localGPUIndex,
        x: gpuX,
        y: gpuY,
        width: gpuWidth,
        height: gpuHeight,
        active: gpu.isActive,
        stage: gpu.ppStage,
        tpLane: gpu.tpLane,
        cpShard: gpu.cpShard,
        epLane: gpu.epLane,
        dpReplica: gpu.dpReplica,
        replicaGroup: gpu.replicaGroup,
        fsdpRank: gpu.fsdpRank,
        utilization: clamp(memoryUtilization, 0, 1),
        linkLoad: 0,
        memoryUsedGB: gpu.memoryUsedGB,
        memoryCapacityGB: gpu.memoryCapacityGB,
        fillColor,
        outlineAlpha,
        title: `GPU ${gpu.globalGPUIndex + 1}`,
        hitBounds: {
          x: gpuX - 3,
          y: gpuY - 3,
          width: gpuWidth + 6,
          height: gpuHeight + 6,
        },
        focusFrame: {
          x: gpuX - 0.8,
          y: gpuY - 0.8,
          width: gpuWidth + 1.6,
          height: gpuHeight + 1.6,
        },
        lodFrame: {
          x: gpuX,
          y: gpuY,
          width: gpuWidth,
          height: gpuHeight,
        },
      }
    })

    const node: SceneNode = {
      id: `node-${nodeIndex}`,
      kind: 'node',
      index: nodeIndex,
      domainIndex,
      domainLocalIndex,
      columnIndex,
      rowIndex,
      x,
      y,
      width: nodeWidth,
      height: nodeHeight,
      hubX: x + nodeWidth / 2,
      hubY: y + nodeHeight / 2,
      busX1: x + 8,
      busX2: x + nodeWidth - 8,
      activeCount: gpus.filter((gpu) => gpu.memoryUsedGB > 0).length,
      localFabricLoad: 0,
      interNodeLoad: 0,
      gpus,
      hitBounds: {
        x,
        y,
        width: nodeWidth,
        height: nodeHeight,
      },
      focusFrame: {
        x: x + 2,
        y: y + 2,
        width: nodeWidth - 4,
        height: nodeHeight - 4,
      },
    }

    nodeByIndex.set(nodeIndex, node)
    const rackNodes = nodesByRack.get(domainIndex)
    if (rackNodes) {
      rackNodes.push(node)
    } else {
      nodesByRack.set(domainIndex, [node])
    }
  }

  const aggregateLinks = new Map<
    string,
    {
      fromNode: number
      toNode: number
      trafficType: SceneLink['trafficType']
      transport: SceneLink['transport']
      loadSum: number
      volumeSum: number
      count: number
    }
  >()
  const aggregateRackLinks = new Map<
    string,
    {
      fromRack: number
      toRack: number
      trafficType: SceneLink['trafficType']
      transport: SceneLink['transport']
      loadSum: number
      volumeSum: number
      count: number
    }
  >()

  // Accumulates one raw link into the node-pair aggregate. Endpoints are
  // ordered so that A->B and B->A land in the same bucket.
  const addAggregate = (
    fromNode: number,
    toNode: number,
    trafficType: SceneLink['trafficType'],
    transport: SceneLink['transport'],
    utilizationPercent: number,
    volumeGB: number,
  ) => {
    const ordered =
      fromNode <= toNode ? ([fromNode, toNode] as const) : ([toNode, fromNode] as const)
    const key = `${ordered[0]}:${ordered[1]}:${trafficType}:${transport}`
    const current = aggregateLinks.get(key) ?? {
      fromNode: ordered[0],
      toNode: ordered[1],
      trafficType,
      transport,
      loadSum: 0,
      volumeSum: 0,
      count: 0,
    }

    current.loadSum += utilizationPercent / 100
    current.volumeSum += volumeGB
    current.count += 1
    aggregateLinks.set(key, current)
  }

  // Same as addAggregate, but keyed by rack pair.
  const addRackAggregate = (
    fromRack: number,
    toRack: number,
    trafficType: SceneLink['trafficType'],
    transport: SceneLink['transport'],
    utilizationPercent: number,
    volumeGB: number,
  ) => {
    const ordered =
      fromRack <= toRack ? ([fromRack, toRack] as const) : ([toRack, fromRack] as const)
    const key = `${ordered[0]}:${ordered[1]}:${trafficType}:${transport}`
    const current = aggregateRackLinks.get(key) ?? {
      fromRack: ordered[0],
      toRack: ordered[1],
      trafficType,
      transport,
      loadSum: 0,
      volumeSum: 0,
      count: 0,
    }

    current.loadSum += utilizationPercent / 100
    current.volumeSum += volumeGB
    current.count += 1
    aggregateRackLinks.set(key, current)
  }

  // Appends a load sample for a node. The lists are private to this function,
  // so they are grown in place rather than copied on every sample.
  const pushLoad = (target: Map<number, number[]>, key: number, value: number) => {
    const samples = target.get(key)
    if (samples) {
      samples.push(value)
    } else {
      target.set(key, [value])
    }
  }

  for (const link of analysis.links) {
    const fromGpu = rawGpuByGlobalIndex.get(link.fromGPU)
    const toGpu = rawGpuByGlobalIndex.get(link.toGPU)

    // Links that reference an unknown GPU cannot be placed; skip them.
    if (!fromGpu || !toGpu) {
      continue
    }

    const fromRack = Math.floor(fromGpu.nodeIndex / layout.nodesPerRack)
    const toRack = Math.floor(toGpu.nodeIndex / layout.nodesPerRack)

    if (fromRack !== toRack) {
      addRackAggregate(
        fromRack,
        toRack,
        link.trafficType,
        link.type,
        link.utilizationPercent,
        link.volumeGB,
      )
      continue
    }

    addAggregate(
      fromGpu.nodeIndex,
      toGpu.nodeIndex,
      link.trafficType,
      link.type,
      link.utilizationPercent,
      link.volumeGB,
    )
  }

  const rowLinks: SceneLink[] = []
  const columnLinks: SceneLink[] = []
  const busLinks: SceneLink[] = []
  const localLoads = new Map<number, number[]>()
  const interLoads = new Map<number, number[]>()
  // Number of bus links already emitted per node. Counting by node index
  // (rather than by id prefix) keeps node 1 from also counting the bus links
  // of nodes 10, 11, ... whose ids share the textual prefix "bus-1".
  const busLaneCountByNode = new Map<number, number>()

  for (const aggregate of aggregateLinks.values()) {
    const averageLoad = aggregate.count > 0 ? aggregate.loadSum / aggregate.count : 0
    const averageVolume = aggregate.count > 0 ? aggregate.volumeSum / aggregate.count : 0
    const color = trafficColorMap[aggregate.trafficType]

    if (aggregate.fromNode === aggregate.toNode) {
      const node = nodeByIndex.get(aggregate.fromNode)
      if (!node) {
        continue
      }

      // Stack successive bus links of the same node 3 units apart.
      const lane = busLaneCountByNode.get(node.index) ?? 0
      busLaneCountByNode.set(node.index, lane + 1)
      const offset = lane * 3

      busLinks.push({
        id: `bus-${node.index}-${aggregate.trafficType}`,
        kind: 'bus',
        scope: 'node',
        x1: node.busX1,
        y1: node.hubY + offset,
        x2: node.busX2,
        y2: node.hubY + offset,
        load: averageLoad,
        color,
        width: 0.9 + averageLoad * 2,
        hitWidth: 10,
        title: `${aggregate.trafficType.toUpperCase()} ${aggregate.transport} on ${nodeLabel} ${node.index + 1}`,
        trafficType: aggregate.trafficType,
        transport: aggregate.transport,
        volumeGB: roundVolume(averageVolume),
      })
      pushLoad(localLoads, node.index, averageLoad)
      continue
    }

    const fromNode = nodeByIndex.get(aggregate.fromNode)
    const toNode = nodeByIndex.get(aggregate.toNode)
    if (!fromNode || !toNode) {
      continue
    }

    const sceneLink: SceneLink = {
      id: `link-${aggregate.fromNode}-${aggregate.toNode}-${aggregate.trafficType}`,
      kind:
        Math.abs(fromNode.hubX - toNode.hubX) >= Math.abs(fromNode.hubY - toNode.hubY)
          ? 'row'
          : 'column',
      scope: 'node',
      x1: fromNode.hubX,
      y1: fromNode.hubY,
      x2: toNode.hubX,
      y2: toNode.hubY,
      load: averageLoad,
      color,
      width: 1 + averageLoad * 2.6,
      hitWidth: aggregate.transport === 'infiniband' ? 18 : 12,
      title:
        `${aggregate.trafficType.toUpperCase()} ${aggregate.transport} between ` +
        `${nodeLabel} ${fromNode.index + 1} and ${nodeLabel} ${toNode.index + 1}`,
      trafficType: aggregate.trafficType,
      transport: aggregate.transport,
      volumeGB: roundVolume(averageVolume),
    }

    if (sceneLink.kind === 'row') {
      rowLinks.push(sceneLink)
    } else {
      columnLinks.push(sceneLink)
    }

    // NVLink traffic counts towards the local fabric, everything else
    // towards the inter-node load.
    const targetMap = aggregate.transport === 'nvlink' ? localLoads : interLoads
    pushLoad(targetMap, fromNode.index, averageLoad)
    pushLoad(targetMap, toNode.index, averageLoad)
  }

  for (const aggregate of aggregateRackLinks.values()) {
    const averageLoad = aggregate.count > 0 ? aggregate.loadSum / aggregate.count : 0
    const averageVolume = aggregate.count > 0 ? aggregate.volumeSum / aggregate.count : 0
    const color = trafficColorMap[aggregate.trafficType]
    const fromRack = pods[aggregate.fromRack]
    const toRack = pods[aggregate.toRack]

    if (!fromRack || !toRack) {
      continue
    }

    const sceneLink: SceneLink = {
      id: `rack-link-${aggregate.fromRack}-${aggregate.toRack}-${aggregate.trafficType}`,
      kind:
        Math.abs(fromRack.centerX - toRack.centerX) >= Math.abs(fromRack.centerY - toRack.centerY)
          ? 'row'
          : 'column',
      scope: 'rack',
      x1: fromRack.centerX,
      y1: fromRack.centerY,
      x2: toRack.centerX,
      y2: toRack.centerY,
      load: averageLoad,
      color,
      width: 1.6 + averageLoad * 3.2,
      hitWidth: 22,
      title:
        `${aggregate.trafficType.toUpperCase()} ${aggregate.transport} between ` +
        `${rackLabel} ${aggregate.fromRack + 1} and ${rackLabel} ${aggregate.toRack + 1}`,
      trafficType: aggregate.trafficType,
      transport: aggregate.transport,
      volumeGB: roundVolume(averageVolume),
    }

    if (sceneLink.kind === 'row') {
      rowLinks.push(sceneLink)
    } else {
      columnLinks.push(sceneLink)
    }

    // Every node of both racks sees the inter-rack load once. The Set guards
    // against double-counting should both ends ever name the same rack.
    for (const rackIndex of new Set([aggregate.fromRack, aggregate.toRack])) {
      for (const node of nodesByRack.get(rackIndex) ?? []) {
        pushLoad(interLoads, node.index, averageLoad)
      }
    }
  }

  for (const node of nodeByIndex.values()) {
    node.localFabricLoad = average(localLoads.get(node.index) ?? [])
    node.interNodeLoad = average(interLoads.get(node.index) ?? [])

    const gpuLinkLoad = clamp(node.localFabricLoad * 0.7 + node.interNodeLoad * 0.6, 0, 1)
    for (const gpu of node.gpus) {
      gpu.linkLoad = gpuLinkLoad
    }
  }

  for (const pod of pods) {
    const rackNodes = nodesByRack.get(pod.index) ?? []
    pod.load = average(rackNodes.map((node) => node.interNodeLoad))
    pod.thermal = average(rackNodes.map((node) => node.localFabricLoad))
    pod.activeGpus = rackNodes.reduce(
      (sum, node) => sum + node.gpus.filter((gpu) => gpu.active).length,
      0,
    )
    pod.totalGpus = rackNodes.reduce((sum, node) => sum + node.gpus.length, 0)
  }

  // The active pod is the one with the most active GPUs; the sort is stable,
  // so the lowest-index pod wins ties.
  const activePod =
    [...pods].sort((left, right) => right.activeGpus - left.activeGpus)[0] ?? pods[0]

  for (const pod of pods) {
    pod.active = pod.id === activePod?.id
  }

  const racks: SceneRack[] = pods.map((pod) => ({
    id: `rack-${pod.index}`,
    index: pod.index,
    x: pod.x,
    y: pod.y,
    width: pod.width,
    height: pod.height,
    load: pod.load,
    nodeIds: (nodesByRack.get(pod.index) ?? []).map((node) => node.id),
    hitBounds: pod.hitBounds,
    focusFrame: pod.focusFrame,
  }))

  const nodes = [...nodeByIndex.values()].sort((left, right) => left.index - right.index)

  return {
    width,
    height,
    podColumns: layout.rackColumns,
    podRows: layout.rackRows,
    podWidth: rackWidth,
    podHeight: rackHeight,
    nodeWidth,
    nodeHeight,
    activePodId: activePod?.id ?? 'pod-0',
    activePodBounds: activePod?.hitBounds ?? {
      x: 0,
      y: 0,
      width,
      height,
    },
    contextualNodeCount: cluster.numNodes,
    lodPolicy: TOPOLOGY_LOD_POLICY,
    pods,
    racks,
    nodes,
    rowLinks,
    columnLinks,
    busLinks,
    objectCounts: {
      pods: pods.length,
      nodes: nodes.length,
      gpus: nodes.reduce((sum, node) => sum + node.gpus.length, 0),
      links: rowLinks.length + columnLinks.length + busLinks.length,
      activeGpus: analysis.gpuMap.filter((gpu) => gpu.isActive).length,
      contextualNodes: cluster.numNodes,
    },
  }
}
771
+
772
/** Arithmetic mean of `values`; an empty list averages to 0. */
function average(values: number[]) {
  const count = values.length
  if (count === 0) {
    return 0
  }

  let total = 0
  values.forEach((value) => {
    total += value
  })

  return total / count
}
779
+
780
/** Rounds `value` to two decimal places. */
function roundVolume(value: number) {
  const hundredths = Math.round(value * 100)
  return hundredths / 100
}
783
+
784
/**
 * Finds the scene object under the point (x, y).
 *
 * Candidates are tested in a fixed priority order: GPUs, then nodes, then
 * pods, then links (row, column, bus). The first hit wins. A link is hit when
 * the point lies within half its `hitWidth` of the link segment.
 *
 * @returns The kind and id of the hit object, or null when nothing is hit.
 */
export function findHoverTarget(
  model: TopologySceneModel,
  x: number,
  y: number,
): HoverTarget | null {
  const containsPoint = (item: { hitBounds: SceneHitBounds }) =>
    pointInBounds(item.hitBounds, x, y)

  for (const { gpus } of model.nodes) {
    const hitGpu = gpus.find(containsPoint)
    if (hitGpu) {
      return { kind: 'gpu', id: hitGpu.id }
    }
  }

  const hitNode = model.nodes.find(containsPoint)
  if (hitNode) {
    return { kind: 'node', id: hitNode.id }
  }

  const hitPod = model.pods.find(containsPoint)
  if (hitPod) {
    return { kind: 'pod', id: hitPod.id }
  }

  const hitLink = [...model.rowLinks, ...model.columnLinks, ...model.busLinks].find(
    (link) =>
      distanceToSegment(x, y, link.x1, link.y1, link.x2, link.y2) <= link.hitWidth / 2,
  )

  return hitLink ? { kind: 'link', id: hitLink.id } : null
}
831
+
832
/**
 * Builds the display-ready details (heading, subheading and metric rows) for
 * a hover target.
 *
 * @returns null when `target` is null or its id no longer exists in `model`.
 */
export function describeTarget(
  model: TopologySceneModel,
  viewModel: WorkbenchViewModel,
  target: HoverTarget | null,
): TargetDetails | null {
  if (!target) {
    return null
  }

  const { cluster } = viewModel.config
  const rackLabel = cluster.rackLabel ?? 'rack'
  const nodeLabel = cluster.nodeLabel ?? 'node'

  switch (target.kind) {
    case 'pod': {
      const pod = model.pods.find((item) => item.id === target.id)
      if (!pod) {
        return null
      }

      const nodeCount = Math.ceil(pod.totalGpus / cluster.gpusPerNode)
      return {
        kind: 'pod',
        id: pod.id,
        heading: `${rackLabel} ${pod.index + 1}`,
        subheading: `${pod.totalGpus} GPUs laid out across ${nodeCount} ${nodeLabel}s.`,
        metrics: [
          { label: 'Active GPUs', value: String(pod.activeGpus) },
          { label: 'Rack-local load', value: percent(pod.thermal) },
          { label: 'Scale-out load', value: percent(pod.load) },
          { label: 'Rack capacity', value: `${pod.totalGpus} GPUs` },
        ],
      }
    }

    case 'gpu': {
      const gpu = model.nodes.flatMap((node) => node.gpus).find((item) => item.id === target.id)
      if (!gpu) {
        return null
      }

      // A GPU holding no memory has no meaningful placement; show "idle".
      const placement = (prefix: string, rank: number) =>
        gpu.memoryUsedGB > 0 ? `${prefix}${rank + 1}` : 'idle'

      return {
        kind: 'gpu',
        id: gpu.id,
        heading: `GPU ${gpu.globalIndex + 1}`,
        subheading: `${rackLabel} ${gpu.domainIndex + 1} · ${nodeLabel} ${gpu.domainLocalIndex + 1} · slot ${gpu.localIndex + 1}`,
        metrics: [
          { label: 'Stage', value: placement('P', gpu.stage) },
          { label: 'Tensor lane', value: placement('T', gpu.tpLane) },
          { label: 'Context shard', value: placement('C', gpu.cpShard) },
          { label: 'Expert lane', value: placement('E', gpu.epLane) },
          { label: 'Data replica', value: placement('D', gpu.dpReplica) },
          { label: 'Replica group', value: placement('G', gpu.replicaGroup) },
          { label: 'FSDP rank', value: placement('F', gpu.fsdpRank) },
          {
            label: 'HBM',
            value: `${gpu.memoryUsedGB.toFixed(1)} / ${gpu.memoryCapacityGB.toFixed(0)} GB`,
          },
          { label: 'Link load', value: percent(gpu.linkLoad) },
        ],
      }
    }

    case 'node': {
      const node = model.nodes.find((item) => item.id === target.id)
      if (!node) {
        return null
      }

      return {
        kind: 'node',
        id: node.id,
        heading: `${nodeLabel} ${node.domainLocalIndex + 1}`,
        subheading: `${rackLabel} ${node.domainIndex + 1} · ${node.activeCount}/${cluster.gpusPerNode} GPUs allocated`,
        metrics: [
          { label: 'Active GPUs', value: String(node.activeCount) },
          { label: 'Rack-local load', value: percent(node.localFabricLoad) },
          { label: 'Scale-out load', value: percent(node.interNodeLoad) },
          { label: 'Node size', value: `${cluster.gpusPerNode} GPUs` },
        ],
      }
    }

    default: {
      // Anything that is not a pod, GPU or node is looked up as a link.
      const link = [...model.rowLinks, ...model.columnLinks, ...model.busLinks].find(
        (item) => item.id === target.id,
      )
      if (!link) {
        return null
      }

      let subheading: string
      if (link.scope === 'rack') {
        subheading = 'Inter-rack fabric segment'
      } else if (link.transport === 'nvlink') {
        subheading = 'Local high-bandwidth GPU fabric'
      } else {
        subheading = 'Rack-local fabric segment'
      }

      return {
        kind: 'link',
        id: link.id,
        heading: link.title,
        subheading,
        metrics: [
          { label: 'Traffic', value: link.trafficType.toUpperCase() },
          { label: 'Transport', value: link.transport },
          { label: 'Utilization', value: percent(link.load) },
          { label: 'Volume', value: `${link.volumeGB.toFixed(2)} GB` },
        ],
      }
    }
  }
}
936
+
937
/**
 * Computes the viewport that centres `bounds` inside a `width` x `height`
 * screen area, zoomed so the bounds fit with `padding` pixels on every side.
 * The zoom is limited to the range [0.04, 32].
 */
export function getBoundsViewport(
  bounds: SceneHitBounds,
  width: number,
  height: number,
  padding = 26,
): ViewportState {
  const inset = padding * 2
  const fitX = (width - inset) / bounds.width
  const fitY = (height - inset) / bounds.height
  // The tighter of the two axes decides the zoom.
  const scale = clamp(Math.min(fitX, fitY), 0.04, 32)

  const scaledWidth = bounds.width * scale
  const scaledHeight = bounds.height * scale

  return {
    scale,
    x: (width - scaledWidth) / 2 - bounds.x * scale,
    y: (height - scaledHeight) / 2 - bounds.y * scale,
  }
}
955
+
956
/**
 * Computes the viewport that fits the entire scene (from the origin to the
 * model's width and height) into a `width` x `height` screen area.
 */
export function getFitViewport(
  model: TopologySceneModel,
  width: number,
  height: number,
): ViewportState {
  const sceneBounds: SceneHitBounds = {
    x: 0,
    y: 0,
    width: model.width,
    height: model.height,
  }

  return getBoundsViewport(sceneBounds, width, height)
}
972
+
973
/**
 * Projects a world-space rectangle into screen space: positions are scaled
 * and then offset by the viewport, sizes are only scaled.
 */
export function worldToScreen(bounds: SceneHitBounds, viewport: ViewportState) {
  const { scale, x: offsetX, y: offsetY } = viewport

  return {
    x: bounds.x * scale + offsetX,
    y: bounds.y * scale + offsetY,
    width: bounds.width * scale,
    height: bounds.height * scale,
  }
}
src/lib/trainingClusterModel.ts ADDED
@@ -0,0 +1,1882 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/** Transformer hyperparameters consumed by the parameter and memory estimators in this module. */
export interface ModelConfig {
  /** Expert accounting is only applied when this is `'moe'` AND `moe` is provided. */
  architecture: 'dense' | 'moe'
  hiddenDim: number
  numLayers: number
  /** Attention head count; the per-head width is derived as `hiddenDim / numHeads`. */
  numHeads: number
  /** Key/value head count; together with the head width it sizes the K and V projections. */
  numKVHeads: number
  vocabSize: number
  /** Dense MLP width; three `hiddenDim x intermediateSize` matrices are counted per dense layer. */
  intermediateSize: number
  /** When true, no separate output-head parameters are counted. */
  tiedEmbeddings: boolean
  /** Optional attention sparsity hint; absent or `'full'` applies no reduction. */
  attentionProfile?: {
    type: 'full' | 'hybrid'
    /** Compared against the sequence length to scale attention activations. */
    slidingWindowSize?: number
    /** Share of attention treated as global; takes precedence over `globalAttentionEveryN`. */
    globalAttentionFraction?: number
    /** Fallback for the global share, interpreted as `1 / globalAttentionEveryN`. */
    globalAttentionEveryN?: number
  }
  moe?: {
    numExperts: number
    expertsPerToken: number
    /** Layers with index below this value are treated as dense; the rest are MoE layers. */
    numDenseLayers: number
    expertIntermediateSize: number
    /** Explicit override for the derived active-parameter count. */
    activeParamsPerToken?: number
  }
}
24
+
25
/** Per-step training settings used for batch-size, memory and efficiency estimates. */
export interface TrainingConfig {
  /** Multiplied with `seqLength`, `gradAccumSteps` and the DP degree to get global batch tokens. */
  microBatchSize: number
  seqLength: number
  gradAccumSteps: number
  /** Selects bytes per parameter/gradient/activation and the peak-FLOPs scaling. */
  precision: 'fp32' | 'bf16' | 'fp16' | 'fp8'
  /** Switches the activation-memory estimate to its checkpointed form. */
  activationCheckpointing: boolean
  /** Selects the optimizer-state bytes charged per parameter. */
  optimizer: 'adam' | 'adamw' | 'sgd' | 'muon'
}
33
+
34
/** Hardware description of a single accelerator. */
export interface GPUSpec {
  /** Matched case-insensitively (e.g. "gb200", "h100") when picking default fabric bandwidths. */
  name: string
  /** Reported as each GPU's memory capacity in the analysis output. */
  hbmCapacityGB: number
  /** BF16 peak; scaled by 0.25 for fp32 and by 2 for fp8. */
  peakTFLOPsBF16: number
  // NOTE(review): presumably HBM bandwidth in TB/s — not read in the visible part of this file.
  memBandwidthTBs: number
}
40
+
41
/** Physical cluster layout; the world size is `gpusPerNode * numNodes`. */
export interface ClusterConfig {
  gpuType: GPUSpec
  gpusPerNode: number
  numNodes: number
  /** Bandwidth (GB per second) used for collectives whose members share one node. */
  intraNodeBandwidthGBs: number
  /** Bandwidth (GB per second) used for collectives that span more than one node. */
  interNodeBandwidthGBs: number
  // NOTE(review): the remaining fields are not read in the visible part of this file;
  // presumably rack grouping and display labels — confirm against the consumers.
  nodesPerRack?: number
  rackLabel?: string
  nodeLabel?: string
  podLabel?: string
}
52
+
53
/**
 * Requested parallelism degrees. `tp * pp * cp * ep` is the model-parallel size and
 * must divide the cluster's GPU count for a layout to be derived.
 */
export interface ParallelismConfig {
  tp: number
  pp: number
  cp: number
  ep: number
  /** Shards optimizer state across DP when `zeroStage >= 1` and no FSDP shard group is set. */
  distributedOptimizer: boolean
  /**
   * Values above 1 enable an FSDP shard group of that many GPUs; it must be a
   * multiple of the model-parallel size and divide the total GPU count.
   */
  fsdpShardGroupSize: number
  /** Sharding thresholds: >= 1 optimizer state, >= 2 gradients, >= 3 parameters. */
  zeroStage: 0 | 1 | 2 | 3
}
62
+
63
/** Full result of `analyzeCluster`: feasibility, memory, communication, throughput and placement. */
export interface ClusterAnalysis {
  feasible: boolean
  /** Human-readable explanation, set when the configuration is rejected. */
  infeasibilityReason?: string
  totalParams: number
  activeParamsPerToken: number
  globalBatchSizeTokens: number
  totalGPUs: number

  /** Parallelism degrees derived from the cluster size and the requested configuration. */
  derivedParallelism: {
    dp: number
    replicaGroups: number
    fsdpShardGroupSize: number
    fsdpGroupSize: number
    ep: number
  }

  /** Per-GPU memory estimate, split by category. */
  memoryBreakdown: {
    parametersGB: number
    optimizerStatesGB: number
    gradientsGB: number
    activationsGB: number
    totalGB: number
    hbmCapacityGB: number
    utilizationPercent: number
  }

  /** One entry per pipeline stage. */
  pipelineStages: Array<{
    stageIndex: number
    layerRange: [number, number]
    numLayers: number
    memoryGB: number
    hasEmbedding: boolean
    hasOutputHead: boolean
  }>

  /** Communication estimates, one section per traffic type. */
  communication: {
    tp: {
      allReducesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
    }
    pp: {
      activationMessageSizeBytes: number
      numP2PTransfersPerStep: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      usesInterNode: boolean
    }
    cp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    fsdp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    ep: {
      allToAllsPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    dp: {
      gradientVolumePerGPU_GB: number
      allReduceTimeMs: number
      canOverlapWithBackward: boolean
      linkUtilizationPercent: number
    }
  }

  /** Step-time and throughput estimates. */
  throughput: {
    computeTimePerStepMs: number
    communicationTimePerStepMs: number
    pipelineBubbleFraction: number
    pipelineBubbleTimeMs: number
    totalStepTimeMs: number
    tokensPerSecond: number
    mfu: number
  }

  /** One entry per physical GPU; unassigned GPUs carry -1 in every rank field. */
  gpuMap: Array<{
    globalGPUIndex: number
    nodeIndex: number
    localGPUIndex: number
    tpGroup: number
    tpLane: number
    ppStage: number
    cpShard: number
    epLane: number
    dpReplica: number
    replicaGroup: number
    fsdpRank: number
    memoryUsedGB: number
    memoryCapacityGB: number
    isActive: boolean
  }>

  /** GPU-to-GPU links, each tagged with its transport and the traffic it carries. */
  links: Array<{
    fromGPU: number
    toGPU: number
    type: 'nvlink' | 'infiniband'
    trafficType: 'tp' | 'pp' | 'cp' | 'fsdp' | 'ep' | 'dp'
    volumeGB: number
    utilizationPercent: number
  }>
}
174
+
175
/**
 * Contiguous slice of layers owned by one pipeline stage.
 * `endLayer` is inclusive (`startLayer + numLayers - 1`), so a stage with no
 * layers has `endLayer === startLayer - 1`.
 */
type LayerDistribution = {
  stageIndex: number
  startLayer: number
  endLayer: number
  numLayers: number
}
181
+
182
/** Per-GPU memory estimate for one pipeline stage, in decimal gigabytes (1e9 bytes). */
type StageMemory = {
  parametersGB: number
  optimizerStatesGB: number
  gradientsGB: number
  activationsGB: number
  /** Sum of the four categories above. */
  totalGB: number
}
189
+
190
/** Parameter totals and layer mix of one pipeline stage, before any sharding is applied. */
type StageParameterCount = {
  /** `sharedParams + expertParams`. */
  stageParams: number
  /** Non-expert parameters, including the embedding / final norm / output head when present. */
  sharedParams: number
  /** Expert parameters of the stage's MoE layers. */
  expertParams: number
  denseLayers: number
  moeLayers: number
  /** True for the first stage. */
  hasEmbedding: boolean
  /** True for the last stage. */
  hasOutputHead: boolean
}
199
+
200
/**
 * Assignment of one physical GPU to its coordinates in the parallelism grid.
 * GPUs that receive no rank carry -1 in every rank field and `isActive: false`.
 */
type PlacementEntry = {
  globalGPUIndex: number
  nodeIndex: number
  localGPUIndex: number
  tpGroup: number
  tpLane: number
  ppStage: number
  cpShard: number
  epLane: number
  dpReplica: number
  replicaGroup: number
  fsdpRank: number
  isActive: boolean
}
214
+
215
/** Degrees derived from the cluster size and the requested parallelism. */
type DerivedParallelism = {
  /** `tp * pp * cp * ep`. */
  modelParallelSize: number
  /** Total GPUs divided by `modelParallelSize`. */
  dp: number
  /** Total GPUs divided by `fsdpGroupSize`. */
  replicaGroups: number
  /** The requested FSDP shard group size when above 1, otherwise `modelParallelSize`. */
  fsdpGroupSize: number
  /** `fsdpGroupSize / modelParallelSize`. */
  fsdpDataParallelDegree: number
}
222
+
223
/** Shape of the object returned by `getModelBreakdown`, kept in sync via inference. */
type ModelBreakdown = ReturnType<typeof getModelBreakdown>
224
+
225
/** Result of the ring-collective cost model for one traffic type. */
type RingCommStats = {
  /** Bytes each GPU moves per step across all collectives of this type. */
  volumeBytesPerGpu: number
  /** `volumeBytesPerGpu` multiplied by the group width and the number of groups. */
  totalVolumeBytes: number
  timePerStepMs: number
  /** Clamped to the range 0-100. */
  linkUtilizationPercent: number
  /** True when the group's members sit on more than one node. */
  usesInterNode: boolean
}
232
+
233
// Decimal gigabyte: every byte -> GB conversion in this module divides by this.
const BYTES_PER_GB = 1e9

// Collective counts assumed per transformer layer, one per traffic type.
const TP_ALL_REDUCES_PER_LAYER = 4
const CP_COLLECTIVES_PER_LAYER = 2
const FSDP_COLLECTIVES_PER_LAYER = 4
const EP_ALL_TO_ALLS_PER_LAYER = 2

// Baseline sustained fraction of peak compute, before per-configuration adjustments.
const DEFAULT_BF16_EFFICIENCY = 0.56
239
+
240
// Limits `value` to [min, max]; the lower bound is applied first, so `max` wins if min > max.
const clamp = (value: number, min: number, max: number) => {
  const raisedToMin = Math.max(value, min)
  return Math.min(raisedToMin, max)
}
242
+
243
// Converts a byte count to decimal gigabytes.
const bytesToGB = (bytes: number) => {
  return bytes / BYTES_PER_GB
}
244
+
245
// Rounds to two decimal places using Math.round on the value scaled by 100.
const round2 = (value: number) => {
  const hundredths = Math.round(value * 100)
  return hundredths / 100
}
246
+
247
// Bytes stored per model parameter: 4 for fp32, 1 for fp8, 2 for everything else.
const getParameterBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }

  if (precision === 'fp8') {
    return 1
  }

  return 2
}
257
+
258
// Bytes per activation element: 4 for fp32, otherwise 2 (fp8 is also charged 2 bytes here).
const getActivationBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }

  return 2
}
260
+
261
// Bytes per gradient element: 4 for fp32, otherwise 2 (fp8 is also charged 2 bytes here).
const getGradientBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }

  return 2
}
263
+
264
/**
 * Optimizer-state bytes charged per parameter.
 *
 * - `sgd`: 4 bytes.
 * - `muon`: 8 bytes. Muon keeps less optimizer state than Adam-family optimizers in
 *   practice; it is modelled as 8 bytes per parameter of extra state on top of bf16 weights.
 * - anything else (adam / adamw): 8 bytes in fp32, 12 bytes otherwise.
 */
const getOptimizerBytesPerParam = (
  optimizer: TrainingConfig['optimizer'],
  precision: TrainingConfig['precision'],
) => {
  switch (optimizer) {
    case 'sgd':
      return 4
    case 'muon':
      return 8
    default:
      return precision === 'fp32' ? 8 : 12
  }
}
280
+
281
// Peak TFLOPs for the training precision, expressed relative to the GPU's BF16 peak:
// a quarter of it for fp32, double for fp8, unchanged for bf16/fp16.
const getPeakTFLOPsForPrecision = (gpu: GPUSpec, precision: TrainingConfig['precision']) => {
  const bf16Peak = gpu.peakTFLOPsBF16

  if (precision === 'fp32') {
    return bf16Peak * 0.25
  }

  if (precision === 'fp8') {
    return bf16Peak * 2
  }

  return bf16Peak
}
291
+
292
/**
 * Estimates the sustained fraction of peak compute for a training setup.
 *
 * Starts from DEFAULT_BF16_EFFICIENCY, subtracts 0.02 for activation
 * checkpointing and 0.08 for fp32, adds 0.02 when the optimizer is muon, and
 * clamps the result to [0.3, 0.62].
 */
const getSustainedComputeEfficiency = (training: TrainingConfig) => {
  let efficiency = DEFAULT_BF16_EFFICIENCY

  if (training.activationCheckpointing) {
    efficiency -= 0.02
  }

  if (training.precision === 'fp32') {
    efficiency -= 0.08
  }

  // This bonus is keyed on the optimizer, not on the model architecture.
  if (training.optimizer === 'muon') {
    efficiency += 0.02
  }

  return clamp(efficiency, 0.3, 0.62)
}
299
+
300
/**
 * Splits `numLayers` into `pp` contiguous pipeline stages.
 *
 * Every stage receives `floor(numLayers / pp)` layers and the first
 * `numLayers % pp` stages receive one extra. `endLayer` is inclusive.
 */
const distributeLayers = (numLayers: number, pp: number): LayerDistribution[] => {
  const minLayersPerStage = Math.floor(numLayers / pp)
  const stagesWithExtraLayer = numLayers % pp
  // Index of the first layer not yet handed to a stage.
  let nextLayer = 0

  return Array.from({ length: pp }, (_unused, stageIndex) => {
    const hasExtraLayer = stageIndex < stagesWithExtraLayer
    const layerCount = minLayersPerStage + (hasExtraLayer ? 1 : 0)
    const firstLayer = nextLayer

    nextLayer += layerCount

    return {
      stageIndex,
      startLayer: firstLayer,
      endLayer: firstLayer + layerCount - 1,
      numLayers: layerCount,
    }
  })
}
319
+
320
/**
 * Picks default intra-/inter-node bandwidths (GB/s) from the GPU name.
 *
 * The lower-cased name is checked for "gb200" first, then "h100"; any other
 * name gets the generic fallback of 300 / 50.
 */
const getDefaultFabric = (gpu: GPUSpec) => {
  const lowerCaseName = gpu.name.toLowerCase()
  // Order matters: the first matching keyword wins.
  const presets = [
    { keyword: 'gb200', intraNode: 900, interNode: 100 },
    { keyword: 'h100', intraNode: 450, interNode: 100 },
  ]
  const preset = presets.find(({ keyword }) => lowerCaseName.includes(keyword))
  const intraNodeBandwidthGBs = preset ? preset.intraNode : 300
  const interNodeBandwidthGBs = preset ? preset.interNode : 50

  return {
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
  }
}
342
+
343
/**
 * Derives parameter counts for a transformer from its hyperparameters.
 *
 * Counted per layer: attention projections (Q and O at `hiddenDim`, K and V at
 * the KV width), two norms, and either a dense MLP (three
 * `hiddenDim x intermediateSize` matrices) or, for MoE layers, `numExperts`
 * experts of three `hiddenDim x expertIntermediateSize` matrices each.
 * Embedding, final norm and (unless tied) output head are added once.
 *
 * Expert accounting only applies when `architecture === 'moe'` AND `moe` is
 * present. `moe.numDenseLayers` is clamped to `[0, numLayers]` so an
 * out-of-range value cannot yield a negative (or oversized) MoE layer count;
 * this matches how the per-stage layer mix bounds the same value.
 *
 * @returns Per-component and total parameter counts plus the active
 *          parameters per token (explicit override if provided, otherwise
 *          derived from `expertsPerToken`).
 */
const getModelBreakdown = (model: ModelConfig) => {
  const moe = model.architecture === 'moe' ? model.moe : undefined

  const headDim = model.hiddenDim / model.numHeads
  const embeddingParams = model.vocabSize * model.hiddenDim
  const kvProjectionDim = model.numKVHeads * headDim

  const perLayerAttentionParams =
    model.hiddenDim * (model.hiddenDim + 2 * kvProjectionDim + model.hiddenDim)
  const perLayerDenseMlpParams = model.hiddenDim * model.intermediateSize * 3
  const perLayerNormParams = model.hiddenDim * 2
  const finalNormParams = model.hiddenDim
  const outputHeadParams = model.tiedEmbeddings ? 0 : embeddingParams

  const perExpertParams = moe ? model.hiddenDim * moe.expertIntermediateSize * 3 : 0
  const totalExpertParamsPerLayer = moe ? perExpertParams * moe.numExperts : 0

  // Clamp so that numDenseLayers > numLayers (or < 0) cannot corrupt the totals below.
  const denseLayerCount = moe
    ? Math.min(Math.max(moe.numDenseLayers, 0), model.numLayers)
    : model.numLayers
  const moeLayerCount = model.numLayers - denseLayerCount

  const sharedDenseLayerParams =
    perLayerAttentionParams + perLayerDenseMlpParams + perLayerNormParams
  // MoE layers replace the dense MLP with experts, so only attention + norms are shared.
  const sharedMoeLayerParams = perLayerAttentionParams + perLayerNormParams
  const sharedParams =
    embeddingParams +
    denseLayerCount * sharedDenseLayerParams +
    moeLayerCount * sharedMoeLayerParams +
    finalNormParams +
    outputHeadParams
  const totalParams = sharedParams + moeLayerCount * totalExpertParamsPerLayer

  // Only `expertsPerToken` experts are exercised per token in each MoE layer.
  const derivedActiveParams = moe
    ? embeddingParams +
      denseLayerCount * sharedDenseLayerParams +
      moeLayerCount * (sharedMoeLayerParams + moe.expertsPerToken * perExpertParams) +
      finalNormParams +
      outputHeadParams
    : totalParams
  const activeParamsPerToken = moe?.activeParamsPerToken ?? derivedActiveParams

  const perLayerTotalParams =
    model.architecture === 'moe'
      ? sharedMoeLayerParams + totalExpertParamsPerLayer
      : sharedDenseLayerParams

  return {
    headDim,
    kvProjectionDim,
    embeddingParams,
    perLayerAttentionParams,
    perLayerDenseMlpParams,
    perLayerNormParams,
    perExpertParams,
    totalExpertParamsPerLayer,
    sharedDenseLayerParams,
    sharedMoeLayerParams,
    denseLayerCount,
    moeLayerCount,
    sharedParams,
    perLayerTotalParams,
    finalNormParams,
    outputHeadParams,
    totalParams,
    activeParamsPerToken,
  }
}
413
+
414
/**
 * Number of micro-batches whose activations are held at once.
 * Without pipeline parallelism (pp <= 1) this is 1; otherwise it is the
 * smaller of `gradAccumSteps` and `pp`, but never less than 1.
 */
const getConcurrentMicroBatches = (
  training: TrainingConfig,
  parallelism: ParallelismConfig,
) => {
  const { pp } = parallelism

  if (pp <= 1) {
    return 1
  }

  const inFlight = Math.min(training.gradAccumSteps, pp)
  return Math.max(1, inFlight)
}
424
+
425
/**
 * Scale factor (at most 1) applied to attention activation memory.
 *
 * Full attention, or no profile at all, gives 1. Otherwise the result blends
 * the global-attention share (counted in full) with the remaining share
 * scaled by `slidingWindowSize / seqLength`, and is clamped to lie between
 * that window ratio and 1. The global share comes from
 * `globalAttentionFraction`, else `1 / globalAttentionEveryN`, else 0.25.
 */
const getAttentionMultiplier = (model: ModelConfig, seqLength: number) => {
  const profile = model.attentionProfile
  if (!profile || profile.type === 'full') {
    return 1
  }

  let windowMultiplier = 1
  if (profile.slidingWindowSize != null) {
    windowMultiplier = clamp(profile.slidingWindowSize / seqLength, 0, 1)
  }

  let globalFraction = profile.globalAttentionFraction
  if (globalFraction == null) {
    globalFraction =
      profile.globalAttentionEveryN != null ? 1 / profile.globalAttentionEveryN : 0.25
  }

  const blended = globalFraction + (1 - globalFraction) * windowMultiplier
  return clamp(blended, windowMultiplier, 1)
}
441
+
442
/**
 * Counts how many of a stage's layers are dense versus MoE.
 *
 * Layers with index below `moe.numDenseLayers` are dense. Models that are not
 * MoE (or lack an `moe` config) are entirely dense.
 */
const getStageLayerMix = (stage: LayerDistribution, model: ModelConfig) => {
  const moe = model.architecture === 'moe' ? model.moe : undefined
  if (!moe) {
    return {
      denseLayers: stage.numLayers,
      moeLayers: 0,
    }
  }

  const lastDenseLayer = moe.numDenseLayers - 1
  let denseLayers: number
  if (lastDenseLayer < stage.startLayer) {
    // The dense prefix ends before this stage begins.
    denseLayers = 0
  } else {
    // Overlap between [startLayer, endLayer] and [0, lastDenseLayer].
    const overlapEnd = Math.min(stage.endLayer, lastDenseLayer)
    denseLayers = Math.max(0, overlapEnd - stage.startLayer + 1)
  }

  return {
    denseLayers,
    moeLayers: stage.numLayers - denseLayers,
  }
}
461
+
462
/**
 * Totals the parameters owned by one pipeline stage, before sharding.
 *
 * The first stage additionally holds the embedding; the last stage holds the
 * final norm and the output head. With `pp === 1` the single stage holds both.
 */
const getStageParameterCount = (
  stage: LayerDistribution,
  modelBreakdown: ModelBreakdown,
  parallelism: ParallelismConfig,
  model: ModelConfig,
): StageParameterCount => {
  const { denseLayers, moeLayers } = getStageLayerMix(stage, model)
  const hasEmbedding = stage.stageIndex === 0
  const hasOutputHead = stage.stageIndex === parallelism.pp - 1

  const layerSharedParams =
    denseLayers * modelBreakdown.sharedDenseLayerParams +
    moeLayers * modelBreakdown.sharedMoeLayerParams
  const embeddingShare = hasEmbedding ? modelBreakdown.embeddingParams : 0
  const outputShare = hasOutputHead
    ? modelBreakdown.finalNormParams + modelBreakdown.outputHeadParams
    : 0

  const sharedParams = layerSharedParams + embeddingShare + outputShare
  const expertParams = moeLayers * modelBreakdown.totalExpertParamsPerLayer

  return {
    stageParams: sharedParams + expertParams,
    sharedParams,
    expertParams,
    denseLayers,
    moeLayers,
    hasEmbedding,
    hasOutputHead,
  }
}
494
+
495
/**
 * Estimates the activation bytes one layer keeps per micro-batch on one GPU.
 *
 * The sequence is first divided by the CP degree. The residual-stream term is
 * further divided by TP (sequence parallelism), and the QKV + MLP
 * intermediates are divided by TP as well; MoE MLP intermediates are also
 * divided by EP. With activation checkpointing the estimate is
 * `2 x hidden + 0.25 x intermediates`, otherwise `6 x hidden + 2 x intermediates`.
 */
const getActivationMemoryBytesPerLayer = ({
  model,
  training,
  parallelism,
  isMoeLayer,
}: {
  model: ModelConfig
  training: TrainingConfig
  parallelism: ParallelismConfig
  isMoeLayer: boolean
}) => {
  const { tp, cp, ep } = parallelism
  const activationBytes = getActivationBytes(training.precision)

  // Context parallelism reduces the activation footprint by the number of sequence shards.
  const tokensPerShard = training.microBatchSize * (training.seqLength / cp)
  const kvHiddenDim = model.numKVHeads * (model.hiddenDim / model.numHeads)

  // Sequence parallelism shards the residual stream and checkpointed layer boundaries across
  // the TP group. We assume TP-enabled dense training uses this Megatron-style optimization.
  const tpSequenceShardFactor = tp > 1 ? tp : 1
  const hiddenStateBytes =
    (tokensPerShard * model.hiddenDim * activationBytes) / tpSequenceShardFactor

  const attentionMultiplier = getAttentionMultiplier(model, training.seqLength)
  const qkvBytes =
    tokensPerShard * (model.hiddenDim + 2 * kvHiddenDim) * activationBytes * attentionMultiplier

  let mlpBytes: number
  if (!isMoeLayer) {
    mlpBytes = tokensPerShard * model.intermediateSize * activationBytes * 2
  } else if (model.moe) {
    mlpBytes =
      (tokensPerShard *
        model.moe.expertIntermediateSize *
        activationBytes *
        model.moe.expertsPerToken *
        2) /
      Math.max(ep, 1)
  } else {
    // Flagged as an MoE layer but no MoE config available: no MLP term is charged.
    mlpBytes = 0
  }

  const shardedIntermediateBytes = (qkvBytes + mlpBytes) / Math.max(tp, 1)

  return training.activationCheckpointing
    ? hiddenStateBytes * 2 + shardedIntermediateBytes * 0.25
    : hiddenStateBytes * 6 + shardedIntermediateBytes * 2
}
539
+
540
/**
 * Estimates per-GPU memory (GB) for one pipeline stage.
 *
 * Shared parameters are divided by TP and expert parameters by TP x EP. On top
 * of that, ZeRO-style sharding divides each category by a shard factor:
 * - parameters: FSDP degree when `zeroStage >= 3`;
 * - optimizer state: when `zeroStage >= 1`, the FSDP degree if an FSDP shard
 *   group is configured, else the DP degree if `distributedOptimizer` is on;
 * - gradients: when `zeroStage >= 2`, the FSDP degree if configured, else DP.
 *
 * Activations are the per-layer estimates summed over the stage's layers and
 * multiplied by the number of in-flight micro-batches; with checkpointing,
 * 1.5x the larger per-layer estimate is added for non-empty stages.
 */
const getStageMemory = (
  stageParams: StageParameterCount,
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const { tp, ep, zeroStage } = parallelism
  const usesFsdpGroup = parallelism.fsdpShardGroupSize > 1
  const fsdpShardFactor = usesFsdpGroup ? derivedParallelism.fsdpDataParallelDegree : 1
  const distributedShardFactor = parallelism.distributedOptimizer ? derivedParallelism.dp : 1

  let parameterShardFactor = 1
  if (zeroStage >= 3) {
    parameterShardFactor = fsdpShardFactor
  }

  let optimizerShardFactor = 1
  if (zeroStage >= 1) {
    optimizerShardFactor = usesFsdpGroup ? fsdpShardFactor : distributedShardFactor
  }

  let gradientShardFactor = 1
  if (zeroStage >= 2) {
    gradientShardFactor = usesFsdpGroup ? fsdpShardFactor : derivedParallelism.dp
  }

  const sharedParamsLocal = stageParams.sharedParams / Math.max(tp, 1)
  const expertParamsLocal = stageParams.expertParams / Math.max(tp * ep, 1)
  // Bytes held locally for one category, given its shard factor and bytes per parameter.
  const shardedBytes = (shardFactor: number, bytesPerParam: number) =>
    (sharedParamsLocal / shardFactor + expertParamsLocal / shardFactor) * bytesPerParam

  const parameterMemoryBytes = shardedBytes(
    parameterShardFactor,
    getParameterBytes(training.precision),
  )
  const optimizerMemoryBytes = shardedBytes(
    optimizerShardFactor,
    getOptimizerBytesPerParam(training.optimizer, training.precision),
  )
  const gradientMemoryBytes = shardedBytes(
    gradientShardFactor,
    getGradientBytes(training.precision),
  )

  const perDenseLayerBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: false,
  })
  const perMoeLayerBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: true,
  })
  const inFlightMicroBatches = getConcurrentMicroBatches(training, parallelism)

  let activationMemoryBytes =
    (perDenseLayerBytes * stageParams.denseLayers + perMoeLayerBytes * stageParams.moeLayers) *
    inFlightMicroBatches
  if (training.activationCheckpointing && stageParams.stageParams > 0) {
    // Extra working set on top of the stored checkpoints.
    activationMemoryBytes += Math.max(perDenseLayerBytes, perMoeLayerBytes) * 1.5
  }

  const totalBytes =
    parameterMemoryBytes + optimizerMemoryBytes + gradientMemoryBytes + activationMemoryBytes

  return {
    parametersGB: bytesToGB(parameterMemoryBytes),
    optimizerStatesGB: bytesToGB(optimizerMemoryBytes),
    gradientsGB: bytesToGB(gradientMemoryBytes),
    activationsGB: bytesToGB(activationMemoryBytes),
    totalGB: bytesToGB(totalBytes),
  }
}
615
+
616
/**
 * Computes the model breakdown, the layer-to-stage distribution, and per-stage
 * parameter counts and memory estimates. Both maps are keyed by stage index.
 */
const getStageMemoryMap = (
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const modelBreakdown = getModelBreakdown(model)
  const layerDistribution = distributeLayers(model.numLayers, parallelism.pp)
  const stageMemory = new Map<number, StageMemory>()
  const stageParameters = new Map<number, StageParameterCount>()

  layerDistribution.forEach((stage) => {
    const parameterCount = getStageParameterCount(stage, modelBreakdown, parallelism, model)
    const memory = getStageMemory(parameterCount, model, training, parallelism, derivedParallelism)

    stageParameters.set(stage.stageIndex, parameterCount)
    stageMemory.set(stage.stageIndex, memory)
  })

  return {
    modelBreakdown,
    layerDistribution,
    stageMemory,
    stageParameters,
  }
}
643
+
644
/**
 * Assigns every rank of the parallelism grid to a physical GPU slot.
 *
 * Ranks are laid out with TP innermost, then EP, CP, PP, FSDP rank and replica
 * group outermost. Each (EP x TP) block is kept on a single node: if it does
 * not fit in the slots remaining on the current node, placement moves on to
 * the next node. After all ranks are placed, the list is padded with inactive
 * entries (all rank fields -1) until it has `gpusPerNode * numNodes` entries.
 *
 * NOTE(review): when `tp * ep` does not evenly divide `gpusPerNode`, moving to
 * the next node leaves slots unused while `globalGPUIndex` keeps counting, so
 * `nodeIndex` can run past `numNodes - 1` — confirm callers rule this out.
 */
const buildPlacement = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
  requiredGPUs: number,
) => {
  const { gpusPerNode } = cluster
  const { tp, pp, cp, ep } = parallelism
  const { replicaGroups, fsdpDataParallelDegree } = derivedParallelism
  const clusterSize = gpusPerNode * cluster.numNodes
  const entries: PlacementEntry[] = []
  // Slot cursor; the global index is always the number of entries placed so far.
  let currentNode = 0
  let slotInNode = 0

  for (let replicaGroup = 0; replicaGroup < replicaGroups; replicaGroup += 1) {
    for (let fsdpRank = 0; fsdpRank < fsdpDataParallelDegree; fsdpRank += 1) {
      const dpReplica = replicaGroup * fsdpDataParallelDegree + fsdpRank

      for (let ppStage = 0; ppStage < pp; ppStage += 1) {
        for (let cpShard = 0; cpShard < cp; cpShard += 1) {
          // Keep the whole EP x TP block on one node.
          if (slotInNode + ep * tp > gpusPerNode) {
            currentNode += 1
            slotInNode = 0
          }

          // Flattened (dpReplica, ppStage, cpShard) coordinate, extended by epLane below.
          const blockIndex = (dpReplica * pp + ppStage) * cp + cpShard

          for (let epLane = 0; epLane < ep; epLane += 1) {
            for (let tpLane = 0; tpLane < tp; tpLane += 1) {
              const globalGPUIndex = entries.length

              entries.push({
                globalGPUIndex,
                nodeIndex: currentNode,
                localGPUIndex: slotInNode,
                tpGroup: blockIndex * ep + epLane,
                tpLane,
                ppStage,
                cpShard,
                epLane,
                dpReplica,
                replicaGroup,
                fsdpRank,
                isActive: globalGPUIndex < requiredGPUs,
              })
              slotInNode += 1
            }
          }
        }
      }
    }
  }

  // Pad with idle GPUs so the result covers the whole cluster.
  while (entries.length < clusterSize) {
    if (slotInNode >= gpusPerNode) {
      currentNode += 1
      slotInNode = 0
    }

    entries.push({
      globalGPUIndex: entries.length,
      nodeIndex: currentNode,
      localGPUIndex: slotInNode,
      tpGroup: -1,
      tpLane: -1,
      ppStage: -1,
      cpShard: -1,
      epLane: -1,
      dpReplica: -1,
      replicaGroup: -1,
      fsdpRank: -1,
      isActive: false,
    })
    slotInNode += 1
  }

  return entries
}
722
+
723
/**
 * Returns the first placement entry matching every provided filter, or
 * `undefined` when none does. Filters left `null`/`undefined` are ignored.
 */
const getPlacementEntry = (
  placement: PlacementEntry[],
  filters: Partial<
    Pick<
      PlacementEntry,
      'dpReplica' | 'replicaGroup' | 'fsdpRank' | 'ppStage' | 'cpShard' | 'epLane' | 'tpLane'
    >
  >,
) => {
  const filterableKeys = [
    'dpReplica',
    'replicaGroup',
    'fsdpRank',
    'ppStage',
    'cpShard',
    'epLane',
    'tpLane',
  ] as const

  return placement.find((entry) =>
    filterableKeys.every((key) => {
      const wanted = filters[key]
      return wanted == null || entry[key] === wanted
    }),
  )
}
742
+
743
/**
 * Derives the data-parallel layout from the cluster size and requested degrees.
 *
 * Returns `null` when the model-parallel size (`tp * pp * cp * ep`) is not
 * positive or does not divide the GPU count, or when the FSDP group size is
 * not a multiple of the model-parallel size or does not divide the GPU count.
 * An `fsdpShardGroupSize` of 1 or less means "no FSDP group": the group size
 * then falls back to the model-parallel size.
 */
const getDerivedParallelism = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
): DerivedParallelism | null => {
  const worldSize = cluster.gpusPerNode * cluster.numNodes
  const { tp, pp, cp, ep, fsdpShardGroupSize } = parallelism
  const modelParallelSize = tp * pp * cp * ep

  const modelParallelFits = modelParallelSize > 0 && worldSize % modelParallelSize === 0
  if (!modelParallelFits) {
    return null
  }

  const fsdpGroupSize = fsdpShardGroupSize > 1 ? fsdpShardGroupSize : modelParallelSize
  const fsdpGroupFits =
    fsdpGroupSize % modelParallelSize === 0 && worldSize % fsdpGroupSize === 0
  if (!fsdpGroupFits) {
    return null
  }

  return {
    modelParallelSize,
    dp: worldSize / modelParallelSize,
    replicaGroups: worldSize / fsdpGroupSize,
    fsdpGroupSize,
    fsdpDataParallelDegree: fsdpGroupSize / modelParallelSize,
  }
}
771
+
772
/**
 * Picks the bandwidth a collective is limited by: the inter-node bandwidth
 * when its members sit on more than one node, otherwise the intra-node
 * bandwidth. Groups of zero or one member never count as inter-node.
 */
const getMaxBandwidthForCollective = (
  members: PlacementEntry[],
  cluster: ClusterConfig,
) => {
  const distinctNodes = new Set(members.map(({ nodeIndex }) => nodeIndex))
  const usesInterNode = members.length > 1 && distinctNodes.size > 1
  const bandwidthGBs = usesInterNode
    ? cluster.interNodeBandwidthGBs
    : cluster.intraNodeBandwidthGBs

  return { bandwidthGBs, usesInterNode }
}
793
+
794
/**
 * Cost model for a ring collective repeated `collectiveCount` times per step.
 *
 * Each GPU moves `2 * (groupWidth - 1) / groupWidth * messageBytes` per
 * collective. Time is that volume divided by the bandwidth selected for
 * `membersForBandwidth`; utilization is the same volume relative to what the
 * link could carry during `totalStepTimeMs`, clamped to 0-100 (0 when the
 * step time is not positive). Groups of width <= 1, a non-positive collective
 * count, or a non-positive message size yield all-zero stats.
 */
const getRingCommStats = ({
  groupCount,
  groupWidth,
  messageBytes,
  collectiveCount,
  membersForBandwidth,
  cluster,
  totalStepTimeMs,
}: {
  groupCount: number
  groupWidth: number
  messageBytes: number
  collectiveCount: number
  membersForBandwidth: PlacementEntry[]
  cluster: ClusterConfig
  totalStepTimeMs: number
}): RingCommStats => {
  const hasTraffic = groupWidth > 1 && collectiveCount > 0 && messageBytes > 0
  if (!hasTraffic) {
    return {
      volumeBytesPerGpu: 0,
      totalVolumeBytes: 0,
      timePerStepMs: 0,
      linkUtilizationPercent: 0,
      usesInterNode: false,
    }
  }

  const bytesPerCollective = (2 * (groupWidth - 1) * messageBytes) / groupWidth
  const volumeBytesPerGpu = bytesPerCollective * collectiveCount
  const totalVolumeBytes = volumeBytesPerGpu * groupWidth * groupCount

  const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(
    membersForBandwidth,
    cluster,
  )
  const volumeGBPerGpu = bytesToGB(volumeBytesPerGpu)
  const timePerStepMs = (volumeGBPerGpu / bandwidthGBs) * 1000

  let linkUtilizationPercent = 0
  if (totalStepTimeMs > 0) {
    const stepSeconds = totalStepTimeMs / 1000
    linkUtilizationPercent = clamp((volumeGBPerGpu / (bandwidthGBs * stepSeconds)) * 100, 0, 100)
  }

  return {
    volumeBytesPerGpu,
    totalVolumeBytes,
    timePerStepMs,
    linkUtilizationPercent,
    usesInterNode,
  }
}
846
+
847
+ export function analyzeCluster(
848
+ model: ModelConfig,
849
+ training: TrainingConfig,
850
+ cluster: ClusterConfig,
851
+ parallelism: ParallelismConfig,
852
+ ): ClusterAnalysis {
853
+ const totalGPUs = cluster.gpusPerNode * cluster.numNodes
854
+ const derivedParallelism = getDerivedParallelism(cluster, parallelism)
855
+ const globalBatchSizeTokens =
856
+ training.microBatchSize *
857
+ training.seqLength *
858
+ training.gradAccumSteps *
859
+ (derivedParallelism?.dp ?? 0)
860
+
861
+ const emptyGpuMap = Array.from({ length: totalGPUs }, (_, globalGPUIndex) => ({
862
+ globalGPUIndex,
863
+ nodeIndex: Math.floor(globalGPUIndex / cluster.gpusPerNode),
864
+ localGPUIndex: globalGPUIndex % cluster.gpusPerNode,
865
+ tpGroup: -1,
866
+ tpLane: -1,
867
+ ppStage: -1,
868
+ cpShard: -1,
869
+ epLane: -1,
870
+ dpReplica: -1,
871
+ replicaGroup: -1,
872
+ fsdpRank: -1,
873
+ memoryUsedGB: 0,
874
+ memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
875
+ isActive: false,
876
+ }))
877
+
878
+ const emptyAnalysis = (): ClusterAnalysis => ({
879
+ feasible: false,
880
+ infeasibilityReason: 'Invalid configuration',
881
+ totalParams: 0,
882
+ activeParamsPerToken: 0,
883
+ globalBatchSizeTokens,
884
+ totalGPUs,
885
+ derivedParallelism: {
886
+ dp: derivedParallelism?.dp ?? 0,
887
+ replicaGroups: derivedParallelism?.replicaGroups ?? 0,
888
+ fsdpShardGroupSize: parallelism.fsdpShardGroupSize,
889
+ fsdpGroupSize: derivedParallelism?.fsdpGroupSize ?? 0,
890
+ ep: parallelism.ep,
891
+ },
892
+ memoryBreakdown: {
893
+ parametersGB: 0,
894
+ optimizerStatesGB: 0,
895
+ gradientsGB: 0,
896
+ activationsGB: 0,
897
+ totalGB: 0,
898
+ hbmCapacityGB: cluster.gpuType.hbmCapacityGB,
899
+ utilizationPercent: 0,
900
+ },
901
+ pipelineStages: [],
902
+ communication: {
903
+ tp: {
904
+ allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER,
905
+ messageSizeBytes: 0,
906
+ totalVolumePerStepGB: 0,
907
+ timePerStepMs: 0,
908
+ linkUtilizationPercent: 0,
909
+ },
910
+ pp: {
911
+ activationMessageSizeBytes: 0,
912
+ numP2PTransfersPerStep: 0,
913
+ totalVolumePerStepGB: 0,
914
+ timePerStepMs: 0,
915
+ usesInterNode: false,
916
+ },
917
+ cp: {
918
+ collectivesPerLayer: CP_COLLECTIVES_PER_LAYER,
919
+ messageSizeBytes: 0,
920
+ totalVolumePerStepGB: 0,
921
+ timePerStepMs: 0,
922
+ linkUtilizationPercent: 0,
923
+ usesInterNode: false,
924
+ },
925
+ fsdp: {
926
+ collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER,
927
+ messageSizeBytes: 0,
928
+ totalVolumePerStepGB: 0,
929
+ timePerStepMs: 0,
930
+ linkUtilizationPercent: 0,
931
+ usesInterNode: false,
932
+ },
933
+ ep: {
934
+ allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER,
935
+ messageSizeBytes: 0,
936
+ totalVolumePerStepGB: 0,
937
+ timePerStepMs: 0,
938
+ linkUtilizationPercent: 0,
939
+ usesInterNode: false,
940
+ },
941
+ dp: {
942
+ gradientVolumePerGPU_GB: 0,
943
+ allReduceTimeMs: 0,
944
+ canOverlapWithBackward: false,
945
+ linkUtilizationPercent: 0,
946
+ },
947
+ },
948
+ throughput: {
949
+ computeTimePerStepMs: 0,
950
+ communicationTimePerStepMs: 0,
951
+ pipelineBubbleFraction: 0,
952
+ pipelineBubbleTimeMs: 0,
953
+ totalStepTimeMs: 0,
954
+ tokensPerSecond: 0,
955
+ mfu: 0,
956
+ },
957
+ gpuMap: emptyGpuMap,
958
+ links: [],
959
+ })
960
+
961
+ if (
962
+ training.microBatchSize <= 0 ||
963
+ training.seqLength <= 0 ||
964
+ training.gradAccumSteps <= 0 ||
965
+ parallelism.tp <= 0 ||
966
+ parallelism.pp <= 0 ||
967
+ parallelism.cp <= 0 ||
968
+ parallelism.ep <= 0
969
+ ) {
970
+ const analysis = emptyAnalysis()
971
+ analysis.infeasibilityReason = 'Batch sizes and parallelism degrees must all be positive.'
972
+ return analysis
973
+ }
974
+
975
+ if (parallelism.tp * parallelism.ep > cluster.gpusPerNode) {
976
+ const analysis = emptyAnalysis()
977
+ analysis.infeasibilityReason =
978
+ `TP × EP requires ${parallelism.tp * parallelism.ep} GPUs per node, but nodes only have ${cluster.gpusPerNode}.`
979
+ return analysis
980
+ }
981
+
982
+ if (!derivedParallelism) {
983
+ const analysis = emptyAnalysis()
984
+ analysis.infeasibilityReason =
985
+ `World size ${totalGPUs} must be divisible by TP × PP × CP × EP, and the FSDP shard group must divide the cluster cleanly.`
986
+ return analysis
987
+ }
988
+
989
+ if (model.hiddenDim % model.numHeads !== 0) {
990
+ const analysis = emptyAnalysis()
991
+ analysis.infeasibilityReason =
992
+ `hiddenDim ${model.hiddenDim} must divide evenly across ${model.numHeads} attention heads.`
993
+ return analysis
994
+ }
995
+
996
+ if (model.numHeads % parallelism.tp !== 0) {
997
+ const analysis = emptyAnalysis()
998
+ analysis.infeasibilityReason =
999
+ `TP ${parallelism.tp} must divide the ${model.numHeads} attention heads.`
1000
+ return analysis
1001
+ }
1002
+
1003
+ if (model.numKVHeads % parallelism.tp !== 0) {
1004
+ const analysis = emptyAnalysis()
1005
+ analysis.infeasibilityReason =
1006
+ `TP ${parallelism.tp} should divide the ${model.numKVHeads} KV heads for clean GQA sharding.`
1007
+ return analysis
1008
+ }
1009
+
1010
+ if (training.seqLength % parallelism.cp !== 0) {
1011
+ const analysis = emptyAnalysis()
1012
+ analysis.infeasibilityReason =
1013
+ `CP ${parallelism.cp} must divide the sequence length ${training.seqLength}.`
1014
+ return analysis
1015
+ }
1016
+
1017
+ if (model.architecture === 'moe' && !model.moe) {
1018
+ const analysis = emptyAnalysis()
1019
+ analysis.infeasibilityReason = 'MoE models require expert metadata.'
1020
+ return analysis
1021
+ }
1022
+
1023
+ if (model.architecture === 'moe' && model.moe && model.moe.numExperts % parallelism.ep !== 0) {
1024
+ const analysis = emptyAnalysis()
1025
+ analysis.infeasibilityReason =
1026
+ `EP ${parallelism.ep} must divide the ${model.moe.numExperts} experts.`
1027
+ return analysis
1028
+ }
1029
+
1030
+ const { modelBreakdown, layerDistribution, stageMemory, stageParameters } = getStageMemoryMap(
1031
+ model,
1032
+ training,
1033
+ parallelism,
1034
+ derivedParallelism,
1035
+ )
1036
+ const placement = buildPlacement(cluster, parallelism, derivedParallelism, totalGPUs)
1037
+ const maxStageLayers = Math.max(...layerDistribution.map((stage) => stage.numLayers), 0)
1038
+
1039
+ const pipelineStages = layerDistribution.map((stage) => {
1040
+ const stageMemoryBreakdown = stageMemory.get(stage.stageIndex)
1041
+ const stageParameterCount = stageParameters.get(stage.stageIndex)
1042
+
1043
+ return {
1044
+ stageIndex: stage.stageIndex,
1045
+ layerRange: [stage.startLayer, stage.endLayer] as [number, number],
1046
+ numLayers: stage.numLayers,
1047
+ memoryGB: round2(
1048
+ (stageMemoryBreakdown?.totalGB ?? 0) *
1049
+ parallelism.tp *
1050
+ parallelism.cp *
1051
+ parallelism.ep *
1052
+ derivedParallelism.dp,
1053
+ ),
1054
+ hasEmbedding: stageParameterCount?.hasEmbedding ?? false,
1055
+ hasOutputHead: stageParameterCount?.hasOutputHead ?? false,
1056
+ }
1057
+ })
1058
+
1059
+ const worstStageIndex = pipelineStages.reduce((worstIndex, stage) => {
1060
+ const worstStageMemory = stageMemory.get(worstIndex)?.totalGB ?? 0
1061
+ const candidateStageMemory = stageMemory.get(stage.stageIndex)?.totalGB ?? 0
1062
+ return candidateStageMemory > worstStageMemory ? stage.stageIndex : worstIndex
1063
+ }, 0)
1064
+
1065
+ const worstStageMemory = stageMemory.get(worstStageIndex) ?? {
1066
+ parametersGB: 0,
1067
+ optimizerStatesGB: 0,
1068
+ gradientsGB: 0,
1069
+ activationsGB: 0,
1070
+ totalGB: 0,
1071
+ }
1072
+
1073
+ const pipelineBubbleFraction =
1074
+ parallelism.pp <= 1
1075
+ ? 0
1076
+ : (parallelism.pp - 1) / (training.gradAccumSteps + parallelism.pp - 1)
1077
+ const boundaryStageCount = Math.min(
1078
+ parallelism.pp,
1079
+ Math.max(0, Math.round(pipelineBubbleFraction * parallelism.pp)),
1080
+ )
1081
+
1082
+ const gpuMap = placement.map((entry) => {
1083
+ const stageMemoryBreakdown =
1084
+ entry.ppStage >= 0
1085
+ ? stageMemory.get(entry.ppStage) ?? {
1086
+ parametersGB: 0,
1087
+ optimizerStatesGB: 0,
1088
+ gradientsGB: 0,
1089
+ activationsGB: 0,
1090
+ totalGB: 0,
1091
+ }
1092
+ : {
1093
+ parametersGB: 0,
1094
+ optimizerStatesGB: 0,
1095
+ gradientsGB: 0,
1096
+ activationsGB: 0,
1097
+ totalGB: 0,
1098
+ }
1099
+ const bubbleIdle = entry.ppStage >= parallelism.pp - boundaryStageCount && entry.ppStage >= 0
1100
+
1101
+ return {
1102
+ globalGPUIndex: entry.globalGPUIndex,
1103
+ nodeIndex: entry.nodeIndex,
1104
+ localGPUIndex: entry.localGPUIndex,
1105
+ tpGroup: entry.tpGroup,
1106
+ tpLane: entry.tpLane,
1107
+ ppStage: entry.ppStage,
1108
+ cpShard: entry.cpShard,
1109
+ epLane: entry.epLane,
1110
+ dpReplica: entry.dpReplica,
1111
+ replicaGroup: entry.replicaGroup,
1112
+ fsdpRank: entry.fsdpRank,
1113
+ memoryUsedGB: round2(entry.isActive ? stageMemoryBreakdown.totalGB : 0),
1114
+ memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
1115
+ isActive: entry.isActive && !bubbleIdle,
1116
+ }
1117
+ })
1118
+
1119
+ const activationBytes = getActivationBytes(training.precision)
1120
+ const shardedSequenceLength = training.seqLength / parallelism.cp
1121
+ const tokensPerMicroBatchShard = training.microBatchSize * shardedSequenceLength
1122
+ const collectiveMessageBytes =
1123
+ tokensPerMicroBatchShard * model.hiddenDim * activationBytes
1124
+
1125
+ const attentionComputeMultiplier = 0.65 + 0.35 * getAttentionMultiplier(model, training.seqLength)
1126
+ const activationCheckpointComputeMultiplier = training.activationCheckpointing ? 1.2 : 1
1127
+ const totalFlopsPerStep =
1128
+ 6 *
1129
+ modelBreakdown.activeParamsPerToken *
1130
+ training.microBatchSize *
1131
+ training.seqLength *
1132
+ training.gradAccumSteps *
1133
+ derivedParallelism.dp *
1134
+ attentionComputeMultiplier *
1135
+ activationCheckpointComputeMultiplier
1136
+ const launchedGPUs = Math.max(totalGPUs, 1)
1137
+ const flopsPerGpuPerStep = totalFlopsPerStep / launchedGPUs
1138
+ const peakTFLOPs = getPeakTFLOPsForPrecision(cluster.gpuType, training.precision)
1139
+ const sustainedTFLOPs = peakTFLOPs * getSustainedComputeEfficiency(training)
1140
+ const computeTimePerStepMs = (flopsPerGpuPerStep / (sustainedTFLOPs * 1e12)) * 1000
1141
+ const pipelineBubbleTimeMs =
1142
+ pipelineBubbleFraction >= 1
1143
+ ? 0
1144
+ : (computeTimePerStepMs * pipelineBubbleFraction) / (1 - pipelineBubbleFraction)
1145
+
1146
+ const tentativeTotalStepTimeMs = computeTimePerStepMs + pipelineBubbleTimeMs
1147
+
1148
+ const tpMembers = placement.filter(
1149
+ (entry) =>
1150
+ entry.dpReplica === 0 &&
1151
+ entry.ppStage === 0 &&
1152
+ entry.cpShard === 0 &&
1153
+ entry.epLane === 0 &&
1154
+ entry.tpLane >= 0,
1155
+ )
1156
+ const tpStats = getRingCommStats({
1157
+ groupCount: parallelism.pp * parallelism.cp * parallelism.ep * derivedParallelism.dp,
1158
+ groupWidth: parallelism.tp,
1159
+ messageBytes: collectiveMessageBytes,
1160
+ collectiveCount: TP_ALL_REDUCES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
1161
+ membersForBandwidth: tpMembers,
1162
+ cluster,
1163
+ totalStepTimeMs: tentativeTotalStepTimeMs,
1164
+ })
1165
+
1166
+ const cpMembers = placement.filter(
1167
+ (entry) =>
1168
+ entry.dpReplica === 0 &&
1169
+ entry.ppStage === 0 &&
1170
+ entry.epLane === 0 &&
1171
+ entry.tpLane === 0 &&
1172
+ entry.cpShard >= 0,
1173
+ )
1174
+ const cpStats = getRingCommStats({
1175
+ groupCount: parallelism.pp * derivedParallelism.dp * parallelism.tp * parallelism.ep,
1176
+ groupWidth: parallelism.cp,
1177
+ messageBytes: collectiveMessageBytes,
1178
+ collectiveCount: CP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
1179
+ membersForBandwidth: cpMembers,
1180
+ cluster,
1181
+ totalStepTimeMs: tentativeTotalStepTimeMs,
1182
+ })
1183
+
1184
+ const averageSharedLayerParams =
1185
+ model.numLayers > 0
1186
+ ? (modelBreakdown.denseLayerCount * modelBreakdown.sharedDenseLayerParams +
1187
+ modelBreakdown.moeLayerCount * modelBreakdown.sharedMoeLayerParams) /
1188
+ model.numLayers
1189
+ : 0
1190
+ const fsdpMessageBytes =
1191
+ parallelism.zeroStage >= 3 && derivedParallelism.fsdpDataParallelDegree > 1
1192
+ ? (averageSharedLayerParams / parallelism.tp / derivedParallelism.fsdpDataParallelDegree) *
1193
+ getParameterBytes(training.precision)
1194
+ : 0
1195
+ const fsdpMembers = placement.filter(
1196
+ (entry) =>
1197
+ entry.replicaGroup === 0 &&
1198
+ entry.ppStage === 0 &&
1199
+ entry.cpShard === 0 &&
1200
+ entry.epLane === 0 &&
1201
+ entry.tpLane === 0,
1202
+ )
1203
+ const fsdpStats = getRingCommStats({
1204
+ groupCount:
1205
+ derivedParallelism.replicaGroups *
1206
+ parallelism.pp *
1207
+ parallelism.cp *
1208
+ parallelism.ep *
1209
+ parallelism.tp,
1210
+ groupWidth: derivedParallelism.fsdpDataParallelDegree,
1211
+ messageBytes: fsdpMessageBytes,
1212
+ collectiveCount: FSDP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
1213
+ membersForBandwidth: fsdpMembers,
1214
+ cluster,
1215
+ totalStepTimeMs: tentativeTotalStepTimeMs,
1216
+ })
1217
+
1218
+ const epMembers = placement.filter(
1219
+ (entry) =>
1220
+ entry.dpReplica === 0 &&
1221
+ entry.ppStage === 0 &&
1222
+ entry.cpShard === 0 &&
1223
+ entry.tpLane === 0 &&
1224
+ entry.epLane >= 0,
1225
+ )
1226
+ const moeLayerCount = modelBreakdown.moeLayerCount
1227
+ const epMessageBytes =
1228
+ model.architecture === 'moe' && model.moe
1229
+ ? tokensPerMicroBatchShard *
1230
+ model.hiddenDim *
1231
+ activationBytes *
1232
+ model.moe.expertsPerToken
1233
+ : 0
1234
+ const epTransferCount = EP_ALL_TO_ALLS_PER_LAYER * moeLayerCount * training.gradAccumSteps
1235
+ const epStats = (() => {
1236
+ if (parallelism.ep <= 1 || epTransferCount <= 0 || epMessageBytes <= 0) {
1237
+ return {
1238
+ totalVolumeBytes: 0,
1239
+ timePerStepMs: 0,
1240
+ linkUtilizationPercent: 0,
1241
+ usesInterNode: false,
1242
+ }
1243
+ }
1244
+
1245
+ const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(epMembers, cluster)
1246
+ const volumeBytesPerGpu = epMessageBytes * epTransferCount * 2
1247
+ const totalVolumeBytes =
1248
+ volumeBytesPerGpu *
1249
+ parallelism.ep *
1250
+ parallelism.pp *
1251
+ parallelism.cp *
1252
+ parallelism.tp *
1253
+ derivedParallelism.dp
1254
+ const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
1255
+ const linkUtilizationPercent =
1256
+ tentativeTotalStepTimeMs > 0
1257
+ ? clamp(
1258
+ (bytesToGB(volumeBytesPerGpu) /
1259
+ (bandwidthGBs * (tentativeTotalStepTimeMs / 1000))) *
1260
+ 100,
1261
+ 0,
1262
+ 100,
1263
+ )
1264
+ : 0
1265
+
1266
+ return {
1267
+ totalVolumeBytes,
1268
+ timePerStepMs,
1269
+ linkUtilizationPercent,
1270
+ usesInterNode,
1271
+ }
1272
+ })()
1273
+
1274
+ let ppTotalVolumeBytes = 0
1275
+ let ppTimePerStepMs = 0
1276
+ let ppUsesInterNode = false
1277
+
1278
+ for (let dpReplica = 0; dpReplica < derivedParallelism.dp; dpReplica += 1) {
1279
+ for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
1280
+ for (let stageIndex = 0; stageIndex < parallelism.pp - 1; stageIndex += 1) {
1281
+ const source = getPlacementEntry(placement, {
1282
+ dpReplica,
1283
+ ppStage: stageIndex,
1284
+ cpShard,
1285
+ epLane: 0,
1286
+ tpLane: 0,
1287
+ })
1288
+ const target = getPlacementEntry(placement, {
1289
+ dpReplica,
1290
+ ppStage: stageIndex + 1,
1291
+ cpShard,
1292
+ epLane: 0,
1293
+ tpLane: 0,
1294
+ })
1295
+
1296
+ if (!source || !target) {
1297
+ continue
1298
+ }
1299
+
1300
+ const usesInterNode = source.nodeIndex !== target.nodeIndex
1301
+ const bandwidthGBs = usesInterNode
1302
+ ? cluster.interNodeBandwidthGBs
1303
+ : cluster.intraNodeBandwidthGBs
1304
+ const perLaneBytes = collectiveMessageBytes / parallelism.tp
1305
+
1306
+ ppUsesInterNode ||= usesInterNode
1307
+ ppTotalVolumeBytes += collectiveMessageBytes * 2 * training.gradAccumSteps
1308
+ ppTimePerStepMs +=
1309
+ (bytesToGB(perLaneBytes) / bandwidthGBs) * 1000 * 2 * training.gradAccumSteps
1310
+ }
1311
+ }
1312
+ }
1313
+
1314
+ const maxStageGradientBytes = Math.max(
1315
+ ...Array.from(stageMemory.values()).map((stage) => stage.gradientsGB * BYTES_PER_GB),
1316
+ 0,
1317
+ )
1318
+ const dpGroupWidth =
1319
+ parallelism.fsdpShardGroupSize > 1
1320
+ ? derivedParallelism.replicaGroups
1321
+ : derivedParallelism.dp
1322
+ const dpMembers = parallelism.fsdpShardGroupSize > 1
1323
+ ? placement.filter(
1324
+ (entry) =>
1325
+ entry.fsdpRank === 0 &&
1326
+ entry.ppStage === 0 &&
1327
+ entry.cpShard === 0 &&
1328
+ entry.epLane === 0 &&
1329
+ entry.tpLane === 0,
1330
+ )
1331
+ : placement.filter(
1332
+ (entry) =>
1333
+ entry.ppStage === 0 &&
1334
+ entry.cpShard === 0 &&
1335
+ entry.epLane === 0 &&
1336
+ entry.tpLane === 0,
1337
+ )
1338
+ const gradientCommBytesPerGpu =
1339
+ dpGroupWidth > 1
1340
+ ? (2 * (dpGroupWidth - 1) * maxStageGradientBytes) / dpGroupWidth
1341
+ : 0
1342
+ const dpBandwidth = getMaxBandwidthForCollective(dpMembers, cluster)
1343
+ const dpTimeMs =
1344
+ dpGroupWidth > 1
1345
+ ? (bytesToGB(gradientCommBytesPerGpu) / dpBandwidth.bandwidthGBs) * 1000
1346
+ : 0
1347
+ const canOverlapDp = dpGroupWidth > 1 && (parallelism.pp > 1 || training.gradAccumSteps > 1)
1348
+ const dpNonOverlappedTimeMs = dpTimeMs * (canOverlapDp ? 0.35 : 1)
1349
+
1350
+ const communicationTimePerStepMs =
1351
+ tpStats.timePerStepMs +
1352
+ cpStats.timePerStepMs +
1353
+ fsdpStats.timePerStepMs +
1354
+ epStats.timePerStepMs +
1355
+ ppTimePerStepMs +
1356
+ dpNonOverlappedTimeMs
1357
+ const totalStepTimeMs =
1358
+ computeTimePerStepMs + pipelineBubbleTimeMs + communicationTimePerStepMs
1359
+ const tokensPerSecond =
1360
+ totalStepTimeMs > 0 ? globalBatchSizeTokens / (totalStepTimeMs / 1000) : 0
1361
+ const mfu =
1362
+ tokensPerSecond > 0
1363
+ ? clamp(
1364
+ (6 * modelBreakdown.activeParamsPerToken * attentionComputeMultiplier * tokensPerSecond) /
1365
+ (launchedGPUs * peakTFLOPs * 1e12),
1366
+ 0,
1367
+ 1,
1368
+ )
1369
+ : 0
1370
+
1371
+ const dpLinkUtilizationPercent =
1372
+ dpGroupWidth > 1 && totalStepTimeMs > 0
1373
+ ? clamp(
1374
+ (bytesToGB(gradientCommBytesPerGpu) /
1375
+ (dpBandwidth.bandwidthGBs * (totalStepTimeMs / 1000))) *
1376
+ 100,
1377
+ 0,
1378
+ 100,
1379
+ )
1380
+ : 0
1381
+
1382
+ const ppPerLaneVolumeGB =
1383
+ parallelism.pp > 1
1384
+ ? bytesToGB(collectiveMessageBytes / parallelism.tp) * 2 * training.gradAccumSteps
1385
+ : 0
1386
+ const ppLinkUtilizationPercent =
1387
+ parallelism.pp > 1 && totalStepTimeMs > 0
1388
+ ? clamp(
1389
+ (ppPerLaneVolumeGB /
1390
+ ((ppUsesInterNode
1391
+ ? cluster.interNodeBandwidthGBs
1392
+ : cluster.intraNodeBandwidthGBs) *
1393
+ (totalStepTimeMs / 1000))) *
1394
+ 100,
1395
+ 0,
1396
+ 100,
1397
+ )
1398
+ : 0
1399
+
1400
+ const links: ClusterAnalysis['links'] = []
1401
+ const visualReplicaSamples = Math.min(derivedParallelism.dp, 12)
1402
+ const sampledDpReplicas = Array.from({ length: visualReplicaSamples }, (_, sampleIndex) =>
1403
+ Math.floor((sampleIndex * derivedParallelism.dp) / visualReplicaSamples),
1404
+ )
1405
+
1406
+ for (const dpReplica of sampledDpReplicas) {
1407
+ for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) {
1408
+ for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
1409
+ for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
1410
+ const tpEntries = placement
1411
+ .filter(
1412
+ (entry) =>
1413
+ entry.dpReplica === dpReplica &&
1414
+ entry.ppStage === ppStage &&
1415
+ entry.cpShard === cpShard &&
1416
+ entry.epLane === epLane,
1417
+ )
1418
+ .sort((left, right) => left.tpLane - right.tpLane)
1419
+
1420
+ if (parallelism.tp > 1) {
1421
+ for (let lane = 0; lane < tpEntries.length; lane += 1) {
1422
+ const from = tpEntries[lane]
1423
+ const to = tpEntries[(lane + 1) % tpEntries.length]
1424
+
1425
+ links.push({
1426
+ fromGPU: from.globalGPUIndex,
1427
+ toGPU: to.globalGPUIndex,
1428
+ type: 'nvlink',
1429
+ trafficType: 'tp',
1430
+ volumeGB: round2(bytesToGB(tpStats.volumeBytesPerGpu)),
1431
+ utilizationPercent: round2(tpStats.linkUtilizationPercent),
1432
+ })
1433
+ }
1434
+ }
1435
+
1436
+ if (ppStage < parallelism.pp - 1) {
1437
+ const nextTpEntries = placement
1438
+ .filter(
1439
+ (entry) =>
1440
+ entry.dpReplica === dpReplica &&
1441
+ entry.ppStage === ppStage + 1 &&
1442
+ entry.cpShard === cpShard &&
1443
+ entry.epLane === epLane,
1444
+ )
1445
+ .sort((left, right) => left.tpLane - right.tpLane)
1446
+
1447
+ for (let lane = 0; lane < Math.min(tpEntries.length, nextTpEntries.length); lane += 1) {
1448
+ const from = tpEntries[lane]
1449
+ const to = nextTpEntries[lane]
1450
+ links.push({
1451
+ fromGPU: from.globalGPUIndex,
1452
+ toGPU: to.globalGPUIndex,
1453
+ type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
1454
+ trafficType: 'pp',
1455
+ volumeGB: round2(ppPerLaneVolumeGB),
1456
+ utilizationPercent: round2(ppLinkUtilizationPercent),
1457
+ })
1458
+ }
1459
+ }
1460
+ }
1461
+ }
1462
+
1463
+ if (parallelism.cp > 1) {
1464
+ for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
1465
+ for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
1466
+ const cpEntries = placement
1467
+ .filter(
1468
+ (entry) =>
1469
+ entry.dpReplica === dpReplica &&
1470
+ entry.ppStage === ppStage &&
1471
+ entry.epLane === epLane &&
1472
+ entry.tpLane === tpLane,
1473
+ )
1474
+ .sort((left, right) => left.cpShard - right.cpShard)
1475
+
1476
+ for (let shardIndex = 0; shardIndex < cpEntries.length; shardIndex += 1) {
1477
+ const from = cpEntries[shardIndex]
1478
+ const to = cpEntries[(shardIndex + 1) % cpEntries.length]
1479
+ links.push({
1480
+ fromGPU: from.globalGPUIndex,
1481
+ toGPU: to.globalGPUIndex,
1482
+ type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
1483
+ trafficType: 'cp',
1484
+ volumeGB: round2(bytesToGB(cpStats.volumeBytesPerGpu)),
1485
+ utilizationPercent: round2(cpStats.linkUtilizationPercent),
1486
+ })
1487
+ }
1488
+ }
1489
+ }
1490
+ }
1491
+
1492
+ if (parallelism.ep > 1) {
1493
+ for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
1494
+ for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
1495
+ const epEntries = placement
1496
+ .filter(
1497
+ (entry) =>
1498
+ entry.dpReplica === dpReplica &&
1499
+ entry.ppStage === ppStage &&
1500
+ entry.cpShard === cpShard &&
1501
+ entry.tpLane === tpLane,
1502
+ )
1503
+ .sort((left, right) => left.epLane - right.epLane)
1504
+
1505
+ for (let lane = 0; lane < epEntries.length; lane += 1) {
1506
+ const from = epEntries[lane]
1507
+ const to = epEntries[(lane + 1) % epEntries.length]
1508
+ links.push({
1509
+ fromGPU: from.globalGPUIndex,
1510
+ toGPU: to.globalGPUIndex,
1511
+ type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
1512
+ trafficType: 'ep',
1513
+ volumeGB: round2(
1514
+ epStats.totalVolumeBytes > 0
1515
+ ? bytesToGB(epStats.totalVolumeBytes) /
1516
+ (parallelism.ep *
1517
+ Math.max(parallelism.tp * parallelism.cp * parallelism.pp * derivedParallelism.dp, 1))
1518
+ : 0,
1519
+ ),
1520
+ utilizationPercent: round2(epStats.linkUtilizationPercent),
1521
+ })
1522
+ }
1523
+ }
1524
+ }
1525
+ }
1526
+
1527
+ if (derivedParallelism.fsdpDataParallelDegree > 1) {
1528
+ for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
1529
+ for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
1530
+ for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
1531
+ const fsdpEntries = placement
1532
+ .filter(
1533
+ (entry) =>
1534
+ entry.replicaGroup === placement.find((item) => item.dpReplica === dpReplica)?.replicaGroup &&
1535
+ entry.ppStage === ppStage &&
1536
+ entry.cpShard === cpShard &&
1537
+ entry.epLane === epLane &&
1538
+ entry.tpLane === tpLane,
1539
+ )
1540
+ .sort((left, right) => left.fsdpRank - right.fsdpRank)
1541
+
1542
+ for (let rank = 0; rank < fsdpEntries.length; rank += 1) {
1543
+ const from = fsdpEntries[rank]
1544
+ const to = fsdpEntries[(rank + 1) % fsdpEntries.length]
1545
+ links.push({
1546
+ fromGPU: from.globalGPUIndex,
1547
+ toGPU: to.globalGPUIndex,
1548
+ type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
1549
+ trafficType: 'fsdp',
1550
+ volumeGB: round2(bytesToGB(fsdpStats.volumeBytesPerGpu)),
1551
+ utilizationPercent: round2(fsdpStats.linkUtilizationPercent),
1552
+ })
1553
+ }
1554
+ }
1555
+ }
1556
+ }
1557
+ }
1558
+
1559
+ if (dpGroupWidth > 1) {
1560
+ for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
1561
+ for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
1562
+ for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
1563
+ const current = placement.find((entry) => entry.dpReplica === dpReplica)
1564
+ if (!current) {
1565
+ continue
1566
+ }
1567
+
1568
+ const from = getPlacementEntry(placement, {
1569
+ replicaGroup:
1570
+ parallelism.fsdpShardGroupSize > 1 ? current.replicaGroup : undefined,
1571
+ fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined,
1572
+ dpReplica: parallelism.fsdpShardGroupSize > 1 ? undefined : dpReplica,
1573
+ ppStage,
1574
+ cpShard,
1575
+ epLane,
1576
+ tpLane,
1577
+ })
1578
+ const to = getPlacementEntry(placement, {
1579
+ replicaGroup:
1580
+ parallelism.fsdpShardGroupSize > 1
1581
+ ? (current.replicaGroup + 1) % derivedParallelism.replicaGroups
1582
+ : undefined,
1583
+ fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined,
1584
+ dpReplica:
1585
+ parallelism.fsdpShardGroupSize > 1
1586
+ ? undefined
1587
+ : (dpReplica + 1) % derivedParallelism.dp,
1588
+ ppStage,
1589
+ cpShard,
1590
+ epLane,
1591
+ tpLane,
1592
+ })
1593
+
1594
+ if (!from || !to) {
1595
+ continue
1596
+ }
1597
+
1598
+ links.push({
1599
+ fromGPU: from.globalGPUIndex,
1600
+ toGPU: to.globalGPUIndex,
1601
+ type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
1602
+ trafficType: 'dp',
1603
+ volumeGB: round2(bytesToGB(gradientCommBytesPerGpu)),
1604
+ utilizationPercent: round2(dpLinkUtilizationPercent),
1605
+ })
1606
+ }
1607
+ }
1608
+ }
1609
+ }
1610
+ }
1611
+ }
1612
+
1613
+ const feasible = worstStageMemory.totalGB <= cluster.gpuType.hbmCapacityGB
1614
+ const infeasibilityReason = feasible
1615
+ ? undefined
1616
+ : `Stage ${worstStageIndex} uses ${round2(worstStageMemory.totalGB)} GB per GPU, exceeding ${cluster.gpuType.hbmCapacityGB} GB of HBM.`
1617
+
1618
+ return {
1619
+ feasible,
1620
+ infeasibilityReason,
1621
+ totalParams: Math.round(modelBreakdown.totalParams),
1622
+ activeParamsPerToken: Math.round(modelBreakdown.activeParamsPerToken),
1623
+ globalBatchSizeTokens,
1624
+ totalGPUs,
1625
+ derivedParallelism: {
1626
+ dp: derivedParallelism.dp,
1627
+ replicaGroups: derivedParallelism.replicaGroups,
1628
+ fsdpShardGroupSize: parallelism.fsdpShardGroupSize,
1629
+ fsdpGroupSize: derivedParallelism.fsdpGroupSize,
1630
+ ep: parallelism.ep,
1631
+ },
1632
+ memoryBreakdown: {
1633
+ parametersGB: round2(worstStageMemory.parametersGB),
1634
+ optimizerStatesGB: round2(worstStageMemory.optimizerStatesGB),
1635
+ gradientsGB: round2(worstStageMemory.gradientsGB),
1636
+ activationsGB: round2(worstStageMemory.activationsGB),
1637
+ totalGB: round2(worstStageMemory.totalGB),
1638
+ hbmCapacityGB: cluster.gpuType.hbmCapacityGB,
1639
+ utilizationPercent: round2(
1640
+ (worstStageMemory.totalGB / cluster.gpuType.hbmCapacityGB) * 100,
1641
+ ),
1642
+ },
1643
+ pipelineStages,
1644
+ communication: {
1645
+ tp: {
1646
+ allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER,
1647
+ messageSizeBytes: collectiveMessageBytes,
1648
+ totalVolumePerStepGB: round2(bytesToGB(tpStats.totalVolumeBytes)),
1649
+ timePerStepMs: round2(tpStats.timePerStepMs),
1650
+ linkUtilizationPercent: round2(tpStats.linkUtilizationPercent),
1651
+ },
1652
+ pp: {
1653
+ activationMessageSizeBytes: collectiveMessageBytes,
1654
+ numP2PTransfersPerStep:
1655
+ parallelism.pp > 1
1656
+ ? 2 *
1657
+ (parallelism.pp - 1) *
1658
+ training.gradAccumSteps *
1659
+ parallelism.cp *
1660
+ parallelism.tp *
1661
+ derivedParallelism.dp
1662
+ : 0,
1663
+ totalVolumePerStepGB: round2(bytesToGB(ppTotalVolumeBytes)),
1664
+ timePerStepMs: round2(ppTimePerStepMs),
1665
+ usesInterNode: ppUsesInterNode,
1666
+ },
1667
+ cp: {
1668
+ collectivesPerLayer: CP_COLLECTIVES_PER_LAYER,
1669
+ messageSizeBytes: collectiveMessageBytes,
1670
+ totalVolumePerStepGB: round2(bytesToGB(cpStats.totalVolumeBytes)),
1671
+ timePerStepMs: round2(cpStats.timePerStepMs),
1672
+ linkUtilizationPercent: round2(cpStats.linkUtilizationPercent),
1673
+ usesInterNode: cpStats.usesInterNode,
1674
+ },
1675
+ fsdp: {
1676
+ collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER,
1677
+ messageSizeBytes: round2(fsdpMessageBytes),
1678
+ totalVolumePerStepGB: round2(bytesToGB(fsdpStats.totalVolumeBytes)),
1679
+ timePerStepMs: round2(fsdpStats.timePerStepMs),
1680
+ linkUtilizationPercent: round2(fsdpStats.linkUtilizationPercent),
1681
+ usesInterNode: fsdpStats.usesInterNode,
1682
+ },
1683
+ ep: {
1684
+ allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER,
1685
+ messageSizeBytes: round2(epMessageBytes),
1686
+ totalVolumePerStepGB: round2(bytesToGB(epStats.totalVolumeBytes)),
1687
+ timePerStepMs: round2(epStats.timePerStepMs),
1688
+ linkUtilizationPercent: round2(epStats.linkUtilizationPercent),
1689
+ usesInterNode: epStats.usesInterNode,
1690
+ },
1691
+ dp: {
1692
+ gradientVolumePerGPU_GB: round2(bytesToGB(gradientCommBytesPerGpu)),
1693
+ allReduceTimeMs: round2(dpTimeMs),
1694
+ canOverlapWithBackward: canOverlapDp,
1695
+ linkUtilizationPercent: round2(dpLinkUtilizationPercent),
1696
+ },
1697
+ },
1698
+ throughput: {
1699
+ computeTimePerStepMs: round2(computeTimePerStepMs),
1700
+ communicationTimePerStepMs: round2(communicationTimePerStepMs),
1701
+ pipelineBubbleFraction: round2(pipelineBubbleFraction),
1702
+ pipelineBubbleTimeMs: round2(pipelineBubbleTimeMs),
1703
+ totalStepTimeMs: round2(totalStepTimeMs),
1704
+ tokensPerSecond: round2(tokensPerSecond),
1705
+ mfu: round2(mfu),
1706
+ },
1707
+ gpuMap,
1708
+ links,
1709
+ }
1710
+ }
1711
+
1712
/**
 * Preset for a 7B-class dense transformer: 32 layers, hidden size 4096,
 * 32 attention heads with 32 KV heads, and full attention.
 *
 * @returns a freshly allocated config object on every call, so callers may
 *          mutate the result without affecting other users of the preset.
 */
export const llama7B = (): ModelConfig => {
  const preset: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 4096,
    numLayers: 32,
    numHeads: 32,
    numKVHeads: 32,
    vocabSize: 32000,
    intermediateSize: 11008,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'full',
    },
  }

  return preset
}
1725
+
1726
/**
 * Preset for a 70B-class dense transformer: 80 layers, hidden size 8192,
 * 64 attention heads sharing 8 KV heads, and full attention.
 *
 * @returns a freshly allocated config object on every call.
 */
export const llama70B = (): ModelConfig => {
  const preset: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 8192,
    numLayers: 80,
    numHeads: 64,
    numKVHeads: 8,
    vocabSize: 32000,
    intermediateSize: 28672,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'full',
    },
  }

  return preset
}
1739
+
1740
/**
 * Preset for a 405B-class dense transformer: 126 layers, hidden size 16384,
 * 128 attention heads sharing 8 KV heads, and full attention.
 *
 * @returns a freshly allocated config object on every call.
 */
export const llama405B = (): ModelConfig => {
  const preset: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 16384,
    numLayers: 126,
    numHeads: 128,
    numKVHeads: 8,
    vocabSize: 128256,
    intermediateSize: 53248,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'full',
    },
  }

  return preset
}
1753
+
1754
/**
 * Preset for a 32B-class dense transformer with hybrid attention:
 * 64 layers, hidden size 5120, 40 attention heads sharing 8 KV heads.
 * The attention profile uses a 4096-token sliding window with a
 * global-attention fraction of 0.25.
 *
 * @returns a freshly allocated config object on every call.
 */
export const olmo3_32B = (): ModelConfig => {
  const preset: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 5120,
    numLayers: 64,
    numHeads: 40,
    numKVHeads: 8,
    vocabSize: 100278,
    intermediateSize: 27648,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'hybrid',
      slidingWindowSize: 4096,
      globalAttentionFraction: 0.25,
    },
  }

  return preset
}
1769
+
1770
/**
 * Preset for a 405B-class dense transformer: 126 layers, hidden size 16384,
 * 128 attention heads sharing 8 KV heads, and full attention.
 *
 * NOTE(review): every field currently matches the `llama405B` preset in this
 * file; confirm whether the two are meant to diverge or one should alias the
 * other.
 *
 * @returns a freshly allocated config object on every call.
 */
export const llama31_405B = (): ModelConfig => {
  const preset: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 16384,
    numLayers: 126,
    numHeads: 128,
    numKVHeads: 8,
    vocabSize: 128256,
    intermediateSize: 53248,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'full',
    },
  }

  return preset
}
1783
+
1784
/**
 * Preset for a mixture-of-experts transformer: 60 layers (6 of them dense),
 * hidden size 3072, 48 attention heads sharing 8 KV heads, and 256 experts
 * with 4 routed per token. Attention is hybrid: a 4096-token sliding window
 * with global attention every 4th layer.
 *
 * `activeParamsPerToken` is supplied explicitly (13e9) rather than derived
 * from the layer shapes.
 *
 * @returns a freshly allocated config object on every call.
 */
export const trinityLarge400B = (): ModelConfig => {
  const preset: ModelConfig = {
    architecture: 'moe',
    hiddenDim: 3072,
    numLayers: 60,
    numHeads: 48,
    numKVHeads: 8,
    vocabSize: 200192,
    intermediateSize: 12288,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'hybrid',
      slidingWindowSize: 4096,
      globalAttentionEveryN: 4,
    },
    moe: {
      numExperts: 256,
      expertsPerToken: 4,
      numDenseLayers: 6,
      expertIntermediateSize: 3072,
      activeParamsPerToken: 13_000_000_000,
    },
  }

  return preset
}
1806
+
1807
/**
 * GPU spec preset named "A100 80GB": 80 GB of HBM, 312 peak BF16 TFLOPs,
 * and 2 TB/s of memory bandwidth.
 *
 * @returns a freshly allocated spec object on every call.
 */
export const a100_80gb = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'A100 80GB',
    hbmCapacityGB: 80,
    peakTFLOPsBF16: 312,
    memBandwidthTBs: 2,
  }

  return spec
}
1813
+
1814
/**
 * GPU spec preset named "H100 SXM": 80 GB of HBM, 989 peak BF16 TFLOPs,
 * and 3.35 TB/s of memory bandwidth.
 *
 * @returns a freshly allocated spec object on every call.
 */
export const h100_sxm = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'H100 SXM',
    hbmCapacityGB: 80,
    peakTFLOPsBF16: 989,
    memBandwidthTBs: 3.35,
  }

  return spec
}
1820
+
1821
/**
 * GPU spec preset named "B300": 2250 peak BF16 TFLOPs and 8 TB/s of memory
 * bandwidth.
 *
 * HBM capacity is 288 GB per GPU. The previous value of 192 GB is the
 * B200/GB200 figure and made memory-feasibility checks for this preset
 * stricter than the hardware warrants.
 *
 * @returns a freshly allocated spec object on every call.
 */
export const b300 = (): GPUSpec => ({
  name: 'B300',
  hbmCapacityGB: 288,
  peakTFLOPsBF16: 2250,
  memBandwidthTBs: 8,
})
1827
+
1828
/**
 * GPU spec preset named "GB200": 192 GB of HBM, 2250 peak BF16 TFLOPs,
 * and 8 TB/s of memory bandwidth.
 *
 * @returns a freshly allocated spec object on every call.
 */
export const gb200 = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'GB200',
    hbmCapacityGB: 192,
    peakTFLOPsBF16: 2250,
    memBandwidthTBs: 8,
  }

  return spec
}
1834
+
1835
/**
 * Cluster preset for a single host with 8 GPUs.
 *
 * Intra- and inter-node bandwidths are taken from `getDefaultFabric` for the
 * chosen GPU type. With only one node, the rack and pod labels are both
 * 'node'.
 *
 * @param gpuType GPU spec to populate the node with; defaults to `a100_80gb()`.
 * @returns a new cluster config.
 */
export const singleNode8GPU = (gpuType: GPUSpec = a100_80gb()): ClusterConfig => {
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)

  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 1,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 1,
    rackLabel: 'node',
    nodeLabel: 'GPU host',
    podLabel: 'node',
  }
}
1850
+
1851
/**
 * Cluster preset with 64 GPUs: 8 nodes of 8 GPUs each, grouped 4 nodes per
 * rack.
 *
 * Intra- and inter-node bandwidths are taken from `getDefaultFabric` for the
 * chosen GPU type.
 *
 * @param gpuType GPU spec to populate the nodes with; defaults to `h100_sxm()`.
 * @returns a new cluster config.
 */
export const cluster64GPU = (gpuType: GPUSpec = h100_sxm()): ClusterConfig => {
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)

  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 8,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 4,
    rackLabel: 'rack',
    nodeLabel: 'GPU host',
    podLabel: 'rack',
  }
}
1866
+
1867
/**
 * Cluster preset with 576 GPUs: 72 nodes ("compute trays") of 8 GB200 GPUs
 * each, grouped 9 nodes per rack (72 GPUs per "NVL72 rack").
 *
 * The GPU type is fixed to `gb200()`; bandwidths are taken from
 * `getDefaultFabric` for that GPU.
 *
 * @returns a new cluster config.
 */
export const frontier576GPU = (): ClusterConfig => {
  const gpuType = gb200()
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)

  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 72,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 9,
    rackLabel: 'NVL72 rack',
    nodeLabel: 'compute tray',
    podLabel: 'rack',
  }
}
src/lib/viewOptions.ts ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ getScenarioWorkbenchConfig,
3
+ type WorkbenchConfig,
4
+ type WorkbenchScenarioId,
5
+ } from './workbench'
6
+
7
/** Options parsed from the page's query string by `getViewOptions`. */
export type ViewOptions = {
  /** True when the `debug` query parameter holds a truthy value. */
  debug: boolean
  /** True when the `snapshot` query parameter holds a truthy value. */
  snapshot: boolean
  /** Scenario selected by the `scenario` parameter; 'default' when absent or unknown. */
  scenario: WorkbenchScenarioId
}
12
+
13
// Scenario ids accepted from the `scenario` query parameter.
const SCENARIO_IDS: readonly WorkbenchScenarioId[] = [
  'default',
  'olmo-pretraining',
  'olmo-long-context',
  'llama-pretraining',
  'llama-long-context',
  'trinity-pretraining',
  'trinity-long-context',
  'infeasible-memory',
]

// Membership set used by getViewOptions to validate the requested scenario.
const SCENARIOS = new Set<WorkbenchScenarioId>(SCENARIO_IDS)
23
+
24
// Lower-cased spellings that switch a boolean query flag on.
const truthyValues = new Set(['1', 'true', 'yes', 'on'])

/**
 * Interprets a raw query-parameter value as a boolean flag.
 *
 * @param value The parameter value, or null when the parameter is absent.
 * @returns True only when the value, compared case-insensitively, is one of
 *   the recognised truthy spellings.
 */
function isTruthy(value: string | null) {
  return value !== null && truthyValues.has(value.toLowerCase())
}
33
+
34
/**
 * Parses view options out of a URL query string.
 *
 * @param search Query string to parse; defaults to `window.location.search`.
 * @returns The debug/snapshot flags and the selected scenario. An absent or
 *   unrecognised `scenario` value falls back to 'default'.
 */
export function getViewOptions(search = window.location.search): ViewOptions {
  const query = new URLSearchParams(search)
  const requested = query.get('scenario') as WorkbenchScenarioId
  const scenario: WorkbenchScenarioId = SCENARIOS.has(requested) ? requested : 'default'
  const debug = isTruthy(query.get('debug'))
  const snapshot = isTruthy(query.get('snapshot'))

  return { debug, snapshot, scenario }
}
47
+
48
/**
 * Resolves a scenario id to its workbench configuration.
 *
 * Delegates to `getScenarioWorkbenchConfig` from './workbench'.
 */
export function getScenarioConfig(scenario: WorkbenchScenarioId): WorkbenchConfig {
  const scenarioConfig = getScenarioWorkbenchConfig(scenario)
  return scenarioConfig
}
src/lib/workbench.ts ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ a100_80gb,
3
+ b300,
4
+ gb200,
5
+ h100_sxm,
6
+ llama31_405B,
7
+ olmo3_32B,
8
+ trinityLarge400B,
9
+ type ClusterConfig,
10
+ type GPUSpec,
11
+ type ModelConfig,
12
+ type ParallelismConfig,
13
+ type TrainingConfig,
14
+ } from './trainingClusterModel'
15
+
16
/** Keys of the built-in example presets in `EXAMPLE_PRESETS`. */
export type ExamplePresetId = 'olmo3-32b' | 'llama31-405b' | 'trinity-large-400b'

/** Training phases every example preset provides. */
export type ExamplePhaseId = 'pretraining' | 'long-context'

/** Keys of the selectable GPU presets in `GPU_PRESETS`. */
export type GpuPresetId = 'a100-80gb' | 'h100-sxm' | 'b300' | 'gb200'

/**
 * Ids of the named scenarios: 'default', one per model family and phase
 * (e.g. 'olmo-pretraining', 'trinity-long-context'), and 'infeasible-memory'.
 */
export type WorkbenchScenarioId =
  | 'default'
  | `${'olmo' | 'llama' | 'trinity'}-${ExamplePhaseId}`
  | 'infeasible-memory'

/** Full state of the workbench: which example it came from plus all configs. */
export type WorkbenchConfig = {
  examplePresetId: ExamplePresetId
  phaseId: ExamplePhaseId
  /** False for untouched presets; `applyGpuPreset` sets it to true. */
  customized: boolean
  model: ModelConfig
  training: TrainingConfig
  cluster: ClusterConfig
  parallelism: ParallelismConfig
}

/** Cluster, training and parallelism settings for one phase of a preset. */
type ExamplePhaseConfig = {
  cluster: ClusterConfig
  training: TrainingConfig
  parallelism: ParallelismConfig
}

/** A labelled example: a model factory plus settings for each phase. */
type ExamplePreset = {
  label: string
  model: () => ModelConfig
  phases: Record<ExamplePhaseId, ExamplePhaseConfig>
}
50
+
51
// Selectable GPU presets: display label plus a factory for the matching spec.
const GPU_PRESETS: Record<GpuPresetId, { label: string; spec: () => GPUSpec }> = {
  'a100-80gb': { label: 'A100 80GB', spec: a100_80gb },
  'h100-sxm': { label: 'H100 SXM', spec: h100_sxm },
  b300: { label: 'B300', spec: b300 },
  gb200: { label: 'GB200', spec: gb200 },
}
69
+
70
/**
 * Reports whether a GPU spec equals a preset spec on name, HBM capacity,
 * peak BF16 TFLOPs and memory bandwidth (strict equality on each).
 */
const gpuPresetMatches = (candidate: GPUSpec, preset: GPUSpec) => {
  const comparedFields = ['name', 'hbmCapacityGB', 'peakTFLOPsBF16', 'memBandwidthTBs'] as const

  return comparedFields.every((field) => candidate[field] === preset[field])
}
75
+
76
/**
 * Builds an H100 SXM cluster with 8 GPUs per node and fixed bandwidths
 * (900 intra-node, 50 inter-node).
 *
 * @param numNodes Number of nodes in the cluster.
 * @param nodesPerRack Number of nodes grouped into one rack.
 */
const h100Cluster = (numNodes: number, nodesPerRack: number): ClusterConfig => {
  const cluster: ClusterConfig = {
    gpuType: h100_sxm(),
    gpusPerNode: 8,
    numNodes,
    intraNodeBandwidthGBs: 900,
    interNodeBandwidthGBs: 50,
    nodesPerRack,
    rackLabel: 'rack',
    nodeLabel: 'GPU host',
    podLabel: 'rack',
  }

  return cluster
}
87
+
88
/**
 * Builds a B300 cluster with 8 GPUs per node and fixed bandwidths
 * (900 intra-node, 50 inter-node).
 *
 * @param numNodes Number of nodes in the cluster.
 * @param nodesPerRack Number of nodes grouped into one rack.
 */
const b300Cluster = (numNodes: number, nodesPerRack: number): ClusterConfig => {
  const cluster: ClusterConfig = {
    gpuType: b300(),
    gpusPerNode: 8,
    numNodes,
    intraNodeBandwidthGBs: 900,
    interNodeBandwidthGBs: 50,
    nodesPerRack,
    rackLabel: 'rack',
    nodeLabel: 'GPU host',
    podLabel: 'rack',
  }

  return cluster
}
99
+
100
// Every example phase trains with micro-batch 1, bf16 precision and activation
// checkpointing; only sequence length, accumulation steps and optimizer vary.
const examplePhaseTraining = (
  seqLength: number,
  gradAccumSteps: number,
  optimizer: TrainingConfig['optimizer'],
): TrainingConfig => ({
  microBatchSize: 1,
  seqLength,
  gradAccumSteps,
  precision: 'bf16',
  activationCheckpointing: true,
  optimizer,
})

// Every example phase enables the distributed optimizer; the remaining knobs
// are passed explicitly so each preset states its full layout.
const examplePhaseParallelism = (
  layout: Omit<ParallelismConfig, 'distributedOptimizer'>,
): ParallelismConfig => ({
  tp: layout.tp,
  pp: layout.pp,
  cp: layout.cp,
  ep: layout.ep,
  distributedOptimizer: true,
  fsdpShardGroupSize: layout.fsdpShardGroupSize,
  zeroStage: layout.zeroStage,
})

/**
 * Built-in example presets, keyed by preset id. Each entry carries a display
 * label, a model factory and the cluster/training/parallelism settings for
 * the 'pretraining' and 'long-context' phases.
 */
export const EXAMPLE_PRESETS: Record<ExamplePresetId, ExamplePreset> = {
  'olmo3-32b': {
    label: 'OLMo 3 32B',
    model: olmo3_32B,
    phases: {
      pretraining: {
        cluster: h100Cluster(128, 16),
        training: examplePhaseTraining(8192, 1, 'adamw'),
        parallelism: examplePhaseParallelism({
          tp: 1,
          pp: 1,
          cp: 1,
          ep: 1,
          fsdpShardGroupSize: 256,
          zeroStage: 3,
        }),
      },
      'long-context': {
        cluster: h100Cluster(32, 8),
        training: examplePhaseTraining(65536, 1, 'adamw'),
        parallelism: examplePhaseParallelism({
          tp: 1,
          pp: 1,
          cp: 8,
          ep: 1,
          fsdpShardGroupSize: 256,
          zeroStage: 3,
        }),
      },
    },
  },
  'llama31-405b': {
    label: 'Llama 3.1 405B',
    model: llama31_405B,
    phases: {
      pretraining: {
        cluster: h100Cluster(2048, 16),
        training: examplePhaseTraining(8192, 16, 'adamw'),
        parallelism: examplePhaseParallelism({
          tp: 8,
          pp: 16,
          cp: 1,
          ep: 1,
          fsdpShardGroupSize: 0,
          zeroStage: 1,
        }),
      },
      'long-context': {
        cluster: h100Cluster(2048, 16),
        training: examplePhaseTraining(131072, 1, 'adamw'),
        parallelism: examplePhaseParallelism({
          tp: 8,
          pp: 16,
          cp: 16,
          ep: 1,
          fsdpShardGroupSize: 0,
          zeroStage: 1,
        }),
      },
    },
  },
  'trinity-large-400b': {
    label: 'Trinity Large 400B',
    model: trinityLarge400B,
    phases: {
      pretraining: {
        cluster: b300Cluster(256, 9),
        training: examplePhaseTraining(8192, 8, 'muon'),
        parallelism: examplePhaseParallelism({
          tp: 1,
          pp: 1,
          cp: 1,
          ep: 8,
          fsdpShardGroupSize: 128,
          zeroStage: 3,
        }),
      },
      'long-context': {
        cluster: b300Cluster(256, 9),
        training: examplePhaseTraining(262144, 1, 'muon'),
        parallelism: examplePhaseParallelism({
          tp: 1,
          pp: 1,
          cp: 4,
          ep: 8,
          fsdpShardGroupSize: 128,
          zeroStage: 3,
        }),
      },
    },
  },
}
240
+
241
/**
 * Builds an un-customized workbench config from an example preset and phase.
 *
 * The result shares no mutable objects with `EXAMPLE_PRESETS`: the model comes
 * from the preset's factory and the training, cluster and parallelism objects
 * are copied.
 *
 * @param examplePresetId Which example preset to start from.
 * @param phaseId Which phase of that preset to load.
 */
const createWorkbenchConfig = (
  examplePresetId: ExamplePresetId,
  phaseId: ExamplePhaseId,
): WorkbenchConfig => {
  const preset = EXAMPLE_PRESETS[examplePresetId]
  const phase = preset.phases[phaseId]

  return {
    examplePresetId,
    phaseId,
    customized: false,
    model: preset.model(),
    training: { ...phase.training },
    // A plain spread would leave `gpuType` aliased to the preset table, so an
    // in-place edit of the returned GPU spec would leak into every later config.
    cluster: { ...phase.cluster, gpuType: { ...phase.cluster.gpuType } },
    parallelism: { ...phase.parallelism },
  }
}
258
+
259
// Llama 3.1 405B on an 8-node H100 cluster with TP 8 and no sharding or
// distributed optimizer; registered under the 'infeasible-memory' scenario id.
const infeasibleMemoryScenario: WorkbenchConfig = {
  examplePresetId: 'llama31-405b',
  phaseId: 'pretraining',
  customized: false,
  model: llama31_405B(),
  training: {
    microBatchSize: 1,
    seqLength: 8192,
    gradAccumSteps: 1,
    precision: 'bf16',
    activationCheckpointing: true,
    optimizer: 'adamw',
  },
  cluster: h100Cluster(8, 4),
  parallelism: {
    tp: 8,
    pp: 1,
    cp: 1,
    ep: 1,
    distributedOptimizer: false,
    fsdpShardGroupSize: 0,
    zeroStage: 0,
  },
}

// Named scenarios; 'default' is the same preset and phase as 'olmo-pretraining'.
const SCENARIOS: Record<WorkbenchScenarioId, WorkbenchConfig> = {
  default: createWorkbenchConfig('olmo3-32b', 'pretraining'),
  'olmo-pretraining': createWorkbenchConfig('olmo3-32b', 'pretraining'),
  'olmo-long-context': createWorkbenchConfig('olmo3-32b', 'long-context'),
  'llama-pretraining': createWorkbenchConfig('llama31-405b', 'pretraining'),
  'llama-long-context': createWorkbenchConfig('llama31-405b', 'long-context'),
  'trinity-pretraining': createWorkbenchConfig('trinity-large-400b', 'pretraining'),
  'trinity-long-context': createWorkbenchConfig('trinity-large-400b', 'long-context'),
  'infeasible-memory': infeasibleMemoryScenario,
}
292
+
293
/**
 * Copies a model config, including its nested `attentionProfile` and `moe`
 * objects when present. A falsy nested value becomes `undefined` in the copy.
 */
const cloneModel = (model: ModelConfig): ModelConfig => {
  const attentionProfile = model.attentionProfile ? { ...model.attentionProfile } : undefined
  const moe = model.moe ? { ...model.moe } : undefined

  return { ...model, attentionProfile, moe }
}
298
+
299
/** Returns a shallow copy of a training config. */
const cloneTraining = (training: TrainingConfig): TrainingConfig => {
  const copy: TrainingConfig = Object.assign({}, training)
  return copy
}
300
+
301
/**
 * Copies a cluster config together with its nested `gpuType` spec.
 *
 * `gpuType` is copied explicitly because it is the one nested object here; a
 * plain spread would leave clone and original sharing the same GPU spec, which
 * breaks the isolation `cloneModel` already provides for its nested objects.
 */
const cloneCluster = (cluster: ClusterConfig): ClusterConfig => ({
  ...cluster,
  gpuType: { ...cluster.gpuType },
})
302
+
303
/** Returns a shallow copy of a parallelism config. */
const cloneParallelism = (parallelism: ParallelismConfig): ParallelismConfig => {
  const copy: ParallelismConfig = Object.assign({}, parallelism)
  return copy
}
306
+
307
/**
 * Copies a workbench config by cloning each of its model, training, cluster
 * and parallelism sections and carrying over the identifying fields.
 */
export const cloneWorkbenchConfig = (config: WorkbenchConfig): WorkbenchConfig => {
  const { examplePresetId, phaseId, customized, model, training, cluster, parallelism } = config

  return {
    examplePresetId,
    phaseId,
    customized,
    model: cloneModel(model),
    training: cloneTraining(training),
    cluster: cloneCluster(cluster),
    parallelism: cloneParallelism(parallelism),
  }
}
316
+
317
/**
 * Returns a clone of the named scenario's config, so the caller does not
 * receive the object stored in the module-level scenario table.
 */
export function getScenarioWorkbenchConfig(scenario: WorkbenchScenarioId) {
  const stored = SCENARIOS[scenario]
  return cloneWorkbenchConfig(stored)
}
320
+
321
/**
 * Lists `{ id, label }` options for the example presets, in table order.
 *
 * The 'llama31-405b' preset is left out of the list.
 */
export function getExamplePresetOptions() {
  const options: Array<{ id: ExamplePresetId; label: string }> = []

  for (const [id, preset] of Object.entries(EXAMPLE_PRESETS)) {
    if (id === 'llama31-405b') {
      continue
    }

    options.push({ id: id as ExamplePresetId, label: preset.label })
  }

  return options
}
329
+
330
/**
 * Lists `{ id, label }` options for the phases of one example preset.
 *
 * 'pretraining' is labelled 'Pretraining'; any other phase is labelled
 * 'Long-context'.
 */
export function getPhaseOptions(examplePresetId: ExamplePresetId) {
  const { phases } = EXAMPLE_PRESETS[examplePresetId]
  const options: Array<{ id: ExamplePhaseId; label: string }> = []

  for (const phaseId of Object.keys(phases)) {
    const label = phaseId === 'pretraining' ? 'Pretraining' : 'Long-context'
    options.push({ id: phaseId as ExamplePhaseId, label })
  }

  return options
}
338
+
339
/** Returns the display label of an example preset. */
export function getExampleLabel(examplePresetId: ExamplePresetId) {
  const { label } = EXAMPLE_PRESETS[examplePresetId]
  return label
}
342
+
343
/** Lists `{ id, label }` options for every GPU preset, in table order. */
export function getGpuPresetOptions() {
  const options: Array<{ id: GpuPresetId; label: string }> = []

  for (const [id, { label }] of Object.entries(GPU_PRESETS)) {
    options.push({ id: id as GpuPresetId, label })
  }

  return options
}
349
+
350
/**
 * Finds the first GPU preset whose spec matches the given GPU spec.
 *
 * @returns The matching preset id, or 'custom' when no preset matches.
 */
export function getGpuPresetId(gpuType: GPUSpec): GpuPresetId | 'custom' {
  const match = Object.entries(GPU_PRESETS).find(([, preset]) =>
    gpuPresetMatches(gpuType, preset.spec()),
  )

  return match ? (match[0] as GpuPresetId) : 'custom'
}
359
+
360
/**
 * Returns a copy of the config with the cluster's GPU replaced by a preset
 * spec and `customized` set to true. Other cluster fields are carried over
 * unchanged; the input config is not modified.
 */
export function applyGpuPreset(config: WorkbenchConfig, gpuPresetId: GpuPresetId): WorkbenchConfig {
  const gpuType = GPU_PRESETS[gpuPresetId].spec()
  const cluster = { ...config.cluster, gpuType }

  return { ...config, customized: true, cluster }
}
370
+
371
/**
 * Switches to another example preset, always starting at its 'pretraining'
 * phase. The current config is ignored: the result is built from the preset.
 */
export function applyExamplePreset(
  _config: WorkbenchConfig,
  examplePresetId: ExamplePresetId,
): WorkbenchConfig {
  const initialPhase: ExamplePhaseId = 'pretraining'
  return createWorkbenchConfig(examplePresetId, initialPhase)
}
377
+
378
/**
 * Switches the current example preset to another phase. Only the preset id is
 * read from `config`; the result is rebuilt from the preset table.
 */
export function applyExamplePhase(
  config: WorkbenchConfig,
  phaseId: ExamplePhaseId,
): WorkbenchConfig {
  const { examplePresetId } = config
  return createWorkbenchConfig(examplePresetId, phaseId)
}
384
+
385
/**
 * Lists the positive divisors of `total` in ascending order, always including
 * `currentValue` even when it is not a divisor.
 *
 * @param total Number whose divisors are offered as options.
 * @param currentValue Currently selected value; always part of the result.
 * @returns A sorted array of unique numbers.
 */
export function getFactorOptions(total: number, currentValue: number) {
  const divisors: number[] = []

  for (let divisor = 1; divisor <= total; divisor += 1) {
    if (total % divisor === 0) {
      divisors.push(divisor)
    }
  }

  const unique = new Set<number>([currentValue, ...divisors])

  return Array.from(unique).sort((a, b) => a - b)
}
src/lib/workbenchPresenter.ts ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { type ClusterAnalysis } from './trainingClusterModel'
2
+ import { getExampleLabel, type WorkbenchConfig } from './workbench'
3
+
4
/** Display-ready data produced by `buildWorkbenchViewModel`. */
export type WorkbenchViewModel = {
  /** The config the view model was built from. */
  config: WorkbenchConfig
  /** The analysis the view model was built from. */
  analysis: ClusterAnalysis
  /** True when the analysis is infeasible and reports a zero step time. */
  structuralIssue: boolean
  /** Human-readable notices, in the order they were generated. */
  warnings: string[]
  headline: string
  subheadline: string
  /** Label/note pairs for the four summary tiles. */
  summary: {
    throughputLabel: string
    throughputNote: string
    gpuLabel: string
    gpuNote: string
    interconnectLabel: string
    interconnectNote: string
    bottleneckLabel: string
    bottleneckNote: string
  }
  /** Label/value rows describing the run. */
  facts: Array<{ label: string; value: string }>
}
23
+
24
// Rounds to the nearest integer and formats it with the runtime's default locale.
const formatInteger = (value: number) => {
  const rounded = Math.round(value)
  return rounded.toLocaleString()
}

// Formats a 0..1 fraction as a whole-number percentage, e.g. 0.256 -> "26%".
const formatPercent = (value: number) => {
  const wholePercent = Math.round(value * 100)
  return `${wholePercent}%`
}

// Formats a value already expressed in percent, e.g. 91.6 -> "92%".
const formatPercentWhole = (value: number) => {
  const wholePercent = Math.round(value)
  return `${wholePercent}%`
}

// Formats gigabytes: no decimals from 100 upward, one decimal below that.
const formatGB = (value: number) => {
  const fractionDigits = value >= 100 ? 0 : 1
  return `${value.toFixed(fractionDigits)} GB`
}
28
+
29
// Counts the GPUs in the analysis map that report more than zero memory used.
const getAllocatedGpuCount = (analysis: ClusterAnalysis) =>
  analysis.gpuMap.reduce((count, gpu) => (gpu.memoryUsedGB > 0 ? count + 1 : count), 0)
31
+
32
// Counts the GPUs in the analysis map whose `isActive` flag is truthy.
const getActiveGpuCount = (analysis: ClusterAnalysis) =>
  analysis.gpuMap.reduce((count, gpu) => (gpu.isActive ? count + 1 : count), 0)
34
+
35
/**
 * Names the largest per-step time cost among the communication categories and
 * the pipeline bubble. When several are tied, the one listed first wins.
 */
const getDominantCommLabel = (analysis: ClusterAnalysis) => {
  const { communication, throughput } = analysis
  const candidates = [
    ['TP collectives', communication.tp.timePerStepMs],
    ['PP activations', communication.pp.timePerStepMs],
    ['CP sequence exchange', communication.cp.timePerStepMs],
    ['FSDP sharding', communication.fsdp.timePerStepMs],
    ['EP routing', communication.ep.timePerStepMs],
    ['DP sync', communication.dp.allReduceTimeMs],
    ['Pipeline bubble', throughput.pipelineBubbleTimeMs],
  ] as const

  let dominant: (typeof candidates)[number] = candidates[0]

  for (const candidate of candidates) {
    // Strictly greater, so an earlier entry keeps the lead on ties.
    if (candidate[1] > dominant[1]) {
      dominant = candidate
    }
  }

  return dominant[0]
}
48
+
49
/**
 * Estimates pipeline-parallel link utilization in percent, capped at 100.
 *
 * Returns 0 unless PP traffic crosses nodes and has both a positive time and a
 * positive volume per step. A zero total step time is treated as one second so
 * the division stays defined.
 */
const getPipelineLinkUtilizationPercent = (config: WorkbenchConfig, analysis: ClusterAnalysis) => {
  const { pp } = analysis.communication

  if (!pp.usesInterNode || !(pp.timePerStepMs > 0) || !(pp.totalVolumePerStepGB > 0)) {
    return 0
  }

  const stepSeconds = analysis.throughput.totalStepTimeMs / 1000 || 1

  return Math.min(
    100,
    (pp.totalVolumePerStepGB / (config.cluster.interNodeBandwidthGBs * stepSeconds)) * 100,
  )
}

/**
 * Turns a workbench config and its cluster analysis into display-ready strings.
 *
 * An infeasible analysis with a zero step time is treated as a structural
 * problem; any other infeasible analysis is treated as a memory overflow. The
 * warnings, throughput note and bottleneck label all follow that distinction.
 *
 * @param config The workbench configuration that was analysed.
 * @param analysis The analysis result for that configuration.
 * @returns Headline, summary tiles, facts and warnings for the UI.
 */
export function buildWorkbenchViewModel(
  config: WorkbenchConfig,
  analysis: ClusterAnalysis,
): WorkbenchViewModel {
  const requestedGpuCount =
    config.parallelism.tp *
    config.parallelism.pp *
    config.parallelism.cp *
    config.parallelism.ep *
    analysis.derivedParallelism.dp
  const allocatedGpuCount = getAllocatedGpuCount(analysis)
  const activeGpuCount = getActiveGpuCount(analysis)
  const totalGPUs = analysis.totalGPUs
  const launchedGpuCount =
    analysis.throughput.totalStepTimeMs > 0 ? Math.min(requestedGpuCount, totalGPUs) : 0
  const darkGpuCount = Math.max(totalGPUs - launchedGpuCount, 0)
  // Clamp to at least one node per rack so a zero value (for example from a
  // manual edit) cannot turn the rack count into Infinity or NaN.
  const nodesPerRack = Math.max(config.cluster.nodesPerRack ?? config.cluster.numNodes, 1)
  const rackCount = Math.ceil(config.cluster.numNodes / nodesPerRack)
  const rackLabel = config.cluster.rackLabel ?? 'rack'
  const nodeLabel = config.cluster.nodeLabel ?? 'node'
  const structuralIssue = !analysis.feasible && analysis.throughput.totalStepTimeMs === 0
  const warnings: string[] = []

  if (!analysis.feasible && analysis.infeasibilityReason) {
    warnings.push(analysis.infeasibilityReason)
  }

  if (structuralIssue) {
    warnings.push('This layout is structurally invalid, so throughput and communication are not estimated.')
  } else if (!analysis.feasible) {
    warnings.push('The run is memory-infeasible, but the app still shows the attempted placement and estimated traffic.')
  }

  if (analysis.memoryBreakdown.utilizationPercent >= 92) {
    warnings.push(
      `Worst-case GPU HBM is ${formatPercentWhole(analysis.memoryBreakdown.utilizationPercent)} full.`,
    )
  }

  if (analysis.throughput.pipelineBubbleFraction >= 0.18) {
    warnings.push(
      `Pipeline bubble is ${formatPercent(analysis.throughput.pipelineBubbleFraction)} of step time.`,
    )
  }

  if (config.parallelism.cp > 1) {
    warnings.push(
      `CP shards each micro-batch into ${config.parallelism.cp} sequence slices and adds sequence exchange traffic.`,
    )
  }

  if (config.parallelism.fsdpShardGroupSize > 1) {
    warnings.push(
      `HSDP shards weights across ${config.parallelism.fsdpShardGroupSize.toLocaleString()}-GPU groups, with ${analysis.derivedParallelism.replicaGroups} replica groups syncing once per step.`,
    )
  }

  if (config.parallelism.ep > 1) {
    warnings.push(
      `EP routes tokens across ${config.parallelism.ep} expert lanes and adds expert all-to-all traffic.`,
    )
  }

  if (!structuralIssue && darkGpuCount > 0) {
    warnings.push(
      `${darkGpuCount.toLocaleString()} GPUs are dark because this launch only uses ${launchedGpuCount.toLocaleString()} ranks.`,
    )
  }

  const throughputLabel = structuralIssue
    ? 'n/a'
    : formatInteger(analysis.throughput.tokensPerSecond)
  const throughputNote = structuralIssue
    ? 'structural constraint violated'
    : !analysis.feasible
      ? 'estimated despite HBM overflow'
      : 'tokens / second'
  // Peak utilization across every link category; PP is derived from its volume
  // because it is the only category computed here rather than read directly.
  const interconnectUtilization = Math.max(
    analysis.communication.tp.linkUtilizationPercent,
    getPipelineLinkUtilizationPercent(config, analysis),
    analysis.communication.cp.linkUtilizationPercent,
    analysis.communication.fsdp.linkUtilizationPercent,
    analysis.communication.ep.linkUtilizationPercent,
    analysis.communication.dp.linkUtilizationPercent,
  )
  const headlineGpuLabel =
    structuralIssue || launchedGpuCount === totalGPUs
      ? `${totalGPUs.toLocaleString()} GPUs`
      : `${launchedGpuCount.toLocaleString()} of ${totalGPUs.toLocaleString()} GPUs`
  // A structurally invalid layout is not an HBM problem, so it gets its own
  // label instead of being reported as a memory bottleneck.
  const bottleneckLabel = structuralIssue
    ? 'Layout constraints'
    : analysis.feasible
      ? getDominantCommLabel(analysis)
      : 'HBM capacity'

  return {
    config,
    analysis,
    structuralIssue,
    warnings,
    headline:
      `${getExampleLabel(config.examplePresetId)}${config.customized ? ' (customized)' : ''} · ` +
      `${config.phaseId} on ${headlineGpuLabel}`,
    subheadline:
      `${formatInteger(analysis.totalParams)} total params, ${formatInteger(analysis.activeParamsPerToken)} active params, ` +
      `${config.model.numLayers} layers, ` +
      `${rackCount} ${rackLabel}${rackCount === 1 ? '' : 's'} of ${config.cluster.gpuType.name}.`,
    summary: {
      throughputLabel,
      throughputNote,
      gpuLabel: `${activeGpuCount}/${launchedGpuCount || allocatedGpuCount || totalGPUs}`,
      gpuNote:
        structuralIssue
          ? 'launch invalid'
          : launchedGpuCount === totalGPUs
            ? 'active in this placement'
            : `${launchedGpuCount}/${totalGPUs} launched on this cluster`,
      interconnectLabel: formatPercentWhole(interconnectUtilization),
      interconnectNote: 'peak link utilization',
      bottleneckLabel,
      bottleneckNote: analysis.feasible
        ? `${formatGB(analysis.memoryBreakdown.totalGB)} on the hottest GPU`
        : analysis.infeasibilityReason ?? 'constraint violation',
    },
    facts: [
      {
        label: 'Model',
        value: `${formatInteger(analysis.totalParams)} params`,
      },
      {
        label: 'Context',
        value: `${config.training.seqLength.toLocaleString()} tokens`,
      },
      {
        label: 'Global batch',
        value: `${analysis.globalBatchSizeTokens.toLocaleString()} tokens / step`,
      },
      {
        label: 'Topology',
        value: `${config.cluster.numNodes} ${nodeLabel}${config.cluster.numNodes === 1 ? '' : 's'}`,
      },
      {
        label: 'Parallelism',
        value:
          `TP ${config.parallelism.tp} · PP ${config.parallelism.pp} · ` +
          `CP ${config.parallelism.cp} · EP ${config.parallelism.ep} · DP ${analysis.derivedParallelism.dp}`,
      },
      {
        label: 'Replica groups',
        value: `${analysis.derivedParallelism.replicaGroups} groups`,
      },
      {
        label: 'FSDP group',
        value:
          config.parallelism.fsdpShardGroupSize > 1
            ? `${config.parallelism.fsdpShardGroupSize.toLocaleString()} GPUs`
            : 'disabled',
      },
      {
        label: 'HBM headroom',
        value: `${formatGB(config.cluster.gpuType.hbmCapacityGB - analysis.memoryBreakdown.totalGB)}`,
      },
    ],
  }
}
src/main.tsx ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { StrictMode } from 'react'
2
+ import { createRoot } from 'react-dom/client'
3
+ import '@fontsource/space-grotesk/400.css'
4
+ import '@fontsource/space-grotesk/500.css'
5
+ import '@fontsource/space-grotesk/700.css'
6
+ import '@fontsource/ibm-plex-mono/400.css'
7
+ import '@fontsource/ibm-plex-mono/500.css'
8
+ import './index.css'
9
+ import App from './App.tsx'
10
+
11
// Mount the app into the #root element; the non-null assertion assumes the
// host page provides that element.
const rootElement = document.getElementById('root')!
const root = createRoot(rootElement)

root.render(
  <StrictMode>
    <App />
  </StrictMode>,
)
src/types/global.d.ts ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export {}
2
+
3
declare global {
  /** Shape of the debug state exposed as `window.__TOPOLOGY_DEBUG__`. */
  type TopologyDebugState = {
    ready: boolean
    viewport: { x: number; y: number; scale: number }
    surfaceSize: { width: number; height: number }
    objectCounts: {
      pods: number
      nodes: number
      gpus: number
      links: number
      activeGpus: number
      contextualNodes: number
    }
    /** Rectangles of scene objects, keyed by object id. */
    objects: Record<string, { x: number; y: number; width: number; height: number }>
    hoveredTarget: { kind: 'pod' | 'node' | 'gpu' | 'link'; id: string } | null
    pinnedTarget: { kind: 'pod' | 'node' | 'gpu' | 'link'; id: string } | null
    detailLevel?: 'overview' | 'board' | 'package' | 'silicon' | 'micro'
    setViewport?: (viewport: { x: number; y: number; scale: number }) => void
  }

  /** Optional hooks attached to `window`. */
  interface Window {
    __PIXI_TOPOLOGY_APP__?: unknown
    __PIXI_FLOW_APP__?: unknown
    __TOPOLOGY_DEBUG__?: TopologyDebugState
  }
}
tests/topology.spec.ts ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { expect, test, type Page } from '@playwright/test'
2
+
3
/** Rectangle of one scene object as read from `window.__TOPOLOGY_DEBUG__.objects`. */
type DebugObjectRect = { x: number; y: number; width: number; height: number }

/** Scene-object rectangles keyed by object id. */
type DebugObjectMap = Record<string, DebugObjectRect>
12
+
13
/**
 * Opens the app in snapshot mode and waits until the topology debug state
 * reports ready with at least one object and the document fonts have loaded.
 *
 * @param query Extra query-string text appended after `?snapshot=1`
 *   (for example '&scenario=olmo-pretraining').
 */
async function gotoScenario(page: Page, query = '') {
  const url = `/?snapshot=1${query}`

  await page.goto(url)
  await page.waitForLoadState('networkidle')
  await page.waitForFunction(() => {
    const state = window.__TOPOLOGY_DEBUG__ as
      | { ready?: boolean; objects?: DebugObjectMap }
      | undefined

    if (!state?.ready || !state.objects) {
      return false
    }

    return Object.keys(state.objects).length > 0
  })
  await page.evaluate(async () => {
    await document.fonts.ready
  })
}
27
+
28
/** Reads one object's rectangle from the page's topology debug state. */
async function getDebugObject(page: Page, id: string) {
  return page.evaluate((objectId) => {
    const { objects } = window.__TOPOLOGY_DEBUG__ as { objects: DebugObjectMap }
    return objects[objectId]
  }, id)
}
34
+
35
/**
 * Returns the first debug object id starting with `prefix`, or null when no
 * id matches.
 */
async function getFirstObjectId(page: Page, prefix: string) {
  return page.evaluate((idPrefix) => {
    const { objects } = window.__TOPOLOGY_DEBUG__ as { objects: DebugObjectMap }
    const match = Object.keys(objects).find((key) => key.startsWith(idPrefix))
    return match ?? null
  }, prefix)
}
41
+
42
/**
 * Scrolls the topology interaction layer into view and returns its bounding
 * box. Throws when the layer has no bounding box.
 */
async function getSurfaceOffset(page: Page) {
  const interactionLayer = page.getByTestId('topology-interaction-layer')
  await interactionLayer.scrollIntoViewIfNeeded()

  const bounds = await interactionLayer.boundingBox()
  if (bounds === null) {
    throw new Error('missing topology interaction layer')
  }

  return bounds
}
52
+
53
/**
 * Computes the centre of a debug object by offsetting its rectangle by the
 * interaction layer's bounding-box origin.
 */
async function objectCenter(page: Page, id: string) {
  const rect = await getDebugObject(page, id)
  const origin = await getSurfaceOffset(page)
  const { x, y, width, height } = rect

  return {
    x: origin.x + x + width / 2,
    y: origin.y + y + height / 2,
  }
}
62
+
63
// Scenario screenshots that share one shape: load the scenario, then compare
// the topology scene against a stored image.
const scenarioScreenshotCases = [
  { title: 'default scenario screenshot', query: '', image: 'topology-default.png' },
  {
    title: 'olmo pretraining screenshot',
    query: '&scenario=olmo-pretraining',
    image: 'topology-olmo-pretraining.png',
  },
  {
    title: 'llama pretraining screenshot',
    query: '&scenario=llama-pretraining',
    image: 'topology-llama-pretraining.png',
  },
  {
    title: 'trinity pretraining screenshot',
    query: '&scenario=trinity-pretraining',
    image: 'topology-trinity-pretraining.png',
  },
  {
    title: 'olmo long-context screenshot',
    query: '&scenario=olmo-long-context',
    image: 'topology-olmo-long-context.png',
  },
  {
    title: 'trinity long-context screenshot',
    query: '&scenario=trinity-long-context',
    image: 'topology-trinity-long-context.png',
  },
]

for (const { title, query, image } of scenarioScreenshotCases) {
  test(title, async ({ page }) => {
    await gotoScenario(page, query)
    await expect(page.getByTestId('topology-scene')).toHaveScreenshot(image)
  })
}

// Same shape as above, but the infeasible banner must be visible first.
test('infeasible memory screenshot', async ({ page }) => {
  await gotoScenario(page, '&scenario=infeasible-memory')
  await expect(page.getByTestId('infeasible-banner')).toBeVisible()
  await expect(page.getByTestId('topology-scene')).toHaveScreenshot('topology-infeasible-memory.png')
})
98
+
99
// Hovers just inside the top-left corner of the first node object and checks
// the inspector text and the scene screenshot.
test('hover highlight screenshot', async ({ page }) => {
  await gotoScenario(page, '&scenario=olmo-pretraining')

  const nodeId = await getFirstObjectId(page, 'node-')
  if (nodeId === null) {
    throw new Error('missing visible node object')
  }

  const rect = await getDebugObject(page, nodeId)
  const origin = await getSurfaceOffset(page)
  const hoverX = origin.x + rect.x + 6
  const hoverY = origin.y + rect.y + 6

  await page.mouse.move(hoverX, hoverY)
  await page.waitForFunction((expectedId) => {
    const state = window.__TOPOLOGY_DEBUG__ as { hoveredTarget?: { id: string } | null }
    return state.hoveredTarget?.id === expectedId
  }, nodeId)

  await expect(page.getByTestId('topology-inspector')).toContainText(/host/i)
  await expect(page.getByTestId('topology-scene')).toHaveScreenshot('topology-hover-node.png')
})
120
+
121
// Clicks the centre of the first GPU object, waits for it to become the
// pinned target, then checks the inspector text and the scene screenshot.
test('pinned inspector screenshot', async ({ page }) => {
  await gotoScenario(page, '&scenario=olmo-pretraining')

  const gpuId = await getFirstObjectId(page, 'gpu-')
  if (gpuId === null) {
    throw new Error('missing visible gpu object')
  }

  const { x, y } = await objectCenter(page, gpuId)
  await page.mouse.click(x, y)
  await page.waitForFunction((expectedId) => {
    const state = window.__TOPOLOGY_DEBUG__ as { pinnedTarget?: { id: string } | null }
    return state.pinnedTarget?.id === expectedId
  }, gpuId)

  await expect(page.getByTestId('topology-inspector')).toContainText('GPU')
  await expect(page.getByTestId('topology-scene')).toHaveScreenshot('topology-pinned-gpu.png')
})
137
+
138
// With debug=1 the debug overlay must be visible; the screenshot is taken of
// the interaction layer rather than the scene.
test('debug overlay screenshot', async ({ page }) => {
  await gotoScenario(page, '&debug=1')

  const overlay = page.getByTestId('topology-debug')
  const interactionLayer = page.getByTestId('topology-interaction-layer')

  await expect(overlay).toBeVisible()
  await expect(interactionLayer).toHaveScreenshot('topology-debug-overlay.png')
})
145
+
146
// Wheel-zooms at the layer centre, drags to pan, then resets the camera,
// checking the debug viewport after each step.
test('supports zoom pan and reset camera', async ({ page }) => {
  await gotoScenario(page)

  // Reads the current viewport from the page's debug state.
  const readViewport = () =>
    page.evaluate(() => {
      return (window.__TOPOLOGY_DEBUG__ as { viewport: { scale: number; x: number } }).viewport
    })

  const layer = page.getByTestId('topology-interaction-layer')
  await layer.scrollIntoViewIfNeeded()
  const initial = await readViewport()
  const initialScrollY = await page.evaluate(() => window.scrollY)

  const bounds = await layer.boundingBox()
  if (bounds === null) {
    throw new Error('missing interaction layer bounds')
  }

  const centerX = bounds.x + bounds.width / 2
  const centerY = bounds.y + bounds.height / 2

  await page.mouse.move(centerX, centerY)
  await page.mouse.wheel(0, -320)
  await page.waitForFunction((scale) => {
    const state = window.__TOPOLOGY_DEBUG__ as { viewport: { scale: number } }
    return state.viewport.scale > scale
  }, initial.scale)

  const zoomed = await readViewport()
  expect(zoomed.scale).toBeGreaterThan(initial.scale)
  // The wheel gesture must zoom the scene without scrolling the page.
  expect(await page.evaluate(() => window.scrollY)).toBe(initialScrollY)

  await page.mouse.down()
  await page.mouse.move(centerX + 80, centerY + 60, {
    steps: 6,
  })
  await page.mouse.up()
  await page.waitForFunction((x) => {
    const state = window.__TOPOLOGY_DEBUG__ as { viewport: { x: number } }
    return state.viewport.x !== x
  }, zoomed.x)

  const panned = await readViewport()
  expect(panned.x).not.toBe(zoomed.x)

  await page.getByTestId('camera-reset').click()
  await page.waitForFunction((scale) => {
    const state = window.__TOPOLOGY_DEBUG__ as { viewport: { scale: number } }
    return Math.abs(state.viewport.scale - scale) < 0.01
  }, initial.scale)

  const restored = await readViewport()
  expect(Math.abs(restored.scale - initial.scale)).toBeLessThan(0.01)
})
199
+
200
// A first click on a GPU pins it; a second click at the same point unpins it
// and the inspector shows the hover target again.
test('supports pin and unpin via click', async ({ page }) => {
  await gotoScenario(page, '&scenario=olmo-pretraining')

  const gpuId = await getFirstObjectId(page, 'gpu-')
  if (gpuId === null) {
    throw new Error('missing visible gpu object')
  }

  const center = await objectCenter(page, gpuId)
  const inspector = page.getByTestId('topology-inspector')

  await page.mouse.click(center.x, center.y)
  await page.waitForFunction((expectedId) => {
    const state = window.__TOPOLOGY_DEBUG__ as { pinnedTarget?: { id: string } | null }
    return state.pinnedTarget?.id === expectedId
  }, gpuId)
  await expect(inspector).toContainText('GPU')

  await page.mouse.click(center.x, center.y)
  await page.waitForFunction(() => {
    const state = window.__TOPOLOGY_DEBUG__ as { pinnedTarget?: { id: string } | null }
    return state.pinnedTarget == null
  })
  await expect(inspector).toContainText('Hover target')
  await expect(inspector).toContainText('GPU')
})
224
+
225
// Editing a model field marks the config as customized, and editing the node
// count updates the reported cluster size.
test('supports manual model and cluster edits beyond the example presets', async ({ page }) => {
  await gotoScenario(page, '&scenario=olmo-pretraining')

  const hiddenDimInput = page.getByLabel('Hidden dim')
  await hiddenDimInput.fill('6144')
  await expect(page.locator('.control-badge', { hasText: 'customized' })).toBeVisible()
  await expect(page.getByText(/hidden 6,144/i)).toBeVisible()

  const nodesInput = page.getByRole('spinbutton', { name: 'Nodes', exact: true })
  await nodesInput.fill('64')
  await expect(page.getByText('512 GPUs in cluster')).toBeVisible()
})
tests/topologyLod.test.ts ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, expect, it } from 'vitest'
2
+
3
+ import { getTopologyLodState } from '../src/lib/topologyLod'
4
+
5
+ describe('topology lod policy', () => {
6
+ it('keeps overview strongest at cluster-scale zoom', () => {
7
+ const lod = getTopologyLodState(0.05)
8
+
9
+ expect(lod.primaryBand).toBe('overview')
10
+ expect(lod.weights.overview).toBe(1)
11
+ expect(lod.weights.board).toBe(0)
12
+ })
13
+
14
+ it('cross-fades only between adjacent detail bands', () => {
15
+ const boardToPackage = getTopologyLodState(2.8)
16
+ const packageLod = getTopologyLodState(6.5)
17
+ const siliconLod = getTopologyLodState(40)
18
+
19
+ expect(boardToPackage.weights.board).toBeGreaterThan(0)
20
+ expect(boardToPackage.weights.package).toBeGreaterThan(0)
21
+ expect(boardToPackage.weights.silicon).toBe(0)
22
+ expect(packageLod.weights.package).toBeGreaterThan(0.4)
23
+ expect(packageLod.weights.board).toBe(0)
24
+ expect(siliconLod.primaryBand).toBe('silicon')
25
+ expect(siliconLod.weights.package).toBe(0)
26
+ })
27
+
28
+ it('activates deep isolation only at extreme gpu zoom', () => {
29
+ const shallow = getTopologyLodState(4)
30
+ const deep = getTopologyLodState(140)
31
+
32
+ expect(shallow.deepIsolation).toBeLessThan(0.1)
33
+ expect(deep.deepIsolation).toBeGreaterThan(0.8)
34
+ expect(deep.weights.micro).toBeGreaterThan(0.5)
35
+ })
36
+
37
+ it('keeps lod weights normalized to a single active blend', () => {
38
+ const scales = [0.05, 0.2, 1.1, 3, 8, 24, 110]
39
+
40
+ for (const scale of scales) {
41
+ const lod = getTopologyLodState(scale)
42
+ const total = Object.values(lod.weights).reduce((sum, value) => sum + value, 0)
43
+ const activeBands = Object.values(lod.weights).filter((value) => value > 0.001).length
44
+
45
+ expect(total).toBeCloseTo(1, 4)
46
+ expect(activeBands).toBeLessThanOrEqual(2)
47
+ }
48
+ })
49
+ })
tests/topologySceneModel.test.ts ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, expect, it } from 'vitest'
2
+
3
+ import { buildTopologySceneModel, describeTarget } from '../src/lib/topologyScene'
4
+ import { analyzeCluster } from '../src/lib/trainingClusterModel'
5
+ import { buildWorkbenchViewModel } from '../src/lib/workbenchPresenter'
6
+ import { getScenarioWorkbenchConfig } from '../src/lib/workbench'
7
+
8
+ describe('topology scene model', () => {
9
+ it('groups nodes into racks using cluster metadata', () => {
10
+ const config = getScenarioWorkbenchConfig('trinity-pretraining')
11
+ const analysis = analyzeCluster(config.model, config.training, config.cluster, config.parallelism)
12
+ const viewModel = buildWorkbenchViewModel(config, analysis)
13
+ const scene = buildTopologySceneModel(viewModel)
14
+
15
+ expect(scene.pods.length).toBeGreaterThan(1)
16
+ expect(scene.nodes).toHaveLength(config.cluster.numNodes)
17
+ expect(scene.objectCounts.gpus).toBe(config.cluster.numNodes * config.cluster.gpusPerNode)
18
+ expect(scene.lodPolicy.maxScale).toBeGreaterThan(100)
19
+ })
20
+
21
+ it('describes GPUs with analysis-backed shard indices, including EP and FSDP', () => {
22
+ const config = getScenarioWorkbenchConfig('trinity-pretraining')
23
+ const analysis = analyzeCluster(config.model, config.training, config.cluster, config.parallelism)
24
+ const viewModel = buildWorkbenchViewModel(config, analysis)
25
+ const scene = buildTopologySceneModel(viewModel)
26
+ const gpu = scene.nodes.flatMap((node) => node.gpus).find((item) => item.memoryUsedGB > 0)
27
+ if (!gpu) {
28
+ throw new Error('expected at least one allocated gpu')
29
+ }
30
+
31
+ const details = describeTarget(scene, viewModel, { kind: 'gpu', id: gpu.id })
32
+ expect(details?.metrics.some((metric) => metric.label === 'Expert lane')).toBe(true)
33
+ expect(details?.metrics.some((metric) => metric.label === 'FSDP rank')).toBe(true)
34
+ })
35
+
36
+ it('keeps the scene renderable for infeasible configurations', () => {
37
+ const config = getScenarioWorkbenchConfig('infeasible-memory')
38
+ const analysis = analyzeCluster(config.model, config.training, config.cluster, config.parallelism)
39
+ const viewModel = buildWorkbenchViewModel(config, analysis)
40
+ const scene = buildTopologySceneModel(viewModel)
41
+
42
+ expect(analysis.feasible).toBe(false)
43
+ expect(scene.nodes.length).toBeGreaterThan(0)
44
+ expect(viewModel.warnings[0]).toContain('exceeding')
45
+ })
46
+
47
+ it('exposes EP traffic links in the Trinity preset', () => {
48
+ const config = getScenarioWorkbenchConfig('trinity-pretraining')
49
+ const analysis = analyzeCluster(config.model, config.training, config.cluster, config.parallelism)
50
+ const viewModel = buildWorkbenchViewModel(config, analysis)
51
+ const scene = buildTopologySceneModel(viewModel)
52
+
53
+ expect(scene.rowLinks.concat(scene.columnLinks, scene.busLinks).some((link) => link.trafficType === 'ep')).toBe(true)
54
+ })
55
+
56
+ it('exposes CP traffic links in the OLMo long-context preset', () => {
57
+ const config = getScenarioWorkbenchConfig('olmo-long-context')
58
+ const analysis = analyzeCluster(config.model, config.training, config.cluster, config.parallelism)
59
+ const viewModel = buildWorkbenchViewModel(config, analysis)
60
+ const scene = buildTopologySceneModel(viewModel)
61
+
62
+ expect(scene.rowLinks.concat(scene.columnLinks, scene.busLinks).some((link) => link.trafficType === 'cp')).toBe(true)
63
+ })
64
+
65
+ it('collapses cross-rack links to rack centers instead of drawing node-to-node lines across racks', () => {
66
+ const config = getScenarioWorkbenchConfig('llama-pretraining')
67
+ const analysis = analyzeCluster(config.model, config.training, config.cluster, config.parallelism)
68
+ const viewModel = buildWorkbenchViewModel(config, analysis)
69
+ const scene = buildTopologySceneModel(viewModel)
70
+ const rackLink = scene.rowLinks
71
+ .concat(scene.columnLinks)
72
+ .find((link) => link.scope === 'rack' && link.transport === 'infiniband')
73
+
74
+ expect(rackLink).toBeDefined()
75
+ expect(scene.pods.some((pod) => pod.centerX === rackLink?.x1 && pod.centerY === rackLink?.y1)).toBe(true)
76
+ expect(scene.pods.some((pod) => pod.centerX === rackLink?.x2 && pod.centerY === rackLink?.y2)).toBe(true)
77
+ })
78
+
79
+ it('keeps stable focus and lod frames for every gpu', () => {
80
+ const config = getScenarioWorkbenchConfig('llama-pretraining')
81
+ const analysis = analyzeCluster(config.model, config.training, config.cluster, config.parallelism)
82
+ const viewModel = buildWorkbenchViewModel(config, analysis)
83
+ const scene = buildTopologySceneModel(viewModel)
84
+ const gpus = scene.nodes.flatMap((node) => node.gpus)
85
+
86
+ expect(gpus.length).toBe(scene.objectCounts.gpus)
87
+ expect(gpus.every((gpu) => gpu.focusFrame.width >= gpu.width && gpu.lodFrame.width === gpu.width)).toBe(true)
88
+ expect(gpus.every((gpu) => gpu.focusFrame.height >= gpu.height && gpu.lodFrame.height === gpu.height)).toBe(true)
89
+ })
90
+ })
tests/trainingClusterModel.test.ts ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, expect, it } from 'vitest'
2
+
3
+ import {
4
+ a100_80gb,
5
+ analyzeCluster,
6
+ b300,
7
+ cluster64GPU,
8
+ h100_sxm,
9
+ llama70B,
10
+ llama7B,
11
+ llama31_405B,
12
+ olmo3_32B,
13
+ singleNode8GPU,
14
+ trinityLarge400B,
15
+ type ClusterConfig,
16
+ type TrainingConfig,
17
+ } from '../src/lib/trainingClusterModel'
18
+
19
+ const baselineTraining: TrainingConfig = {
20
+ microBatchSize: 1,
21
+ seqLength: 2048,
22
+ gradAccumSteps: 8,
23
+ precision: 'bf16',
24
+ activationCheckpointing: true,
25
+ optimizer: 'adamw',
26
+ }
27
+
28
+ describe('trainingClusterModel', () => {
29
+ it('fits Llama 2 7B on 8x A100 80GB with TP=8 and derived DP=1', () => {
30
+ const analysis = analyzeCluster(llama7B(), baselineTraining, singleNode8GPU(a100_80gb()), {
31
+ tp: 8,
32
+ pp: 1,
33
+ cp: 1,
34
+ ep: 1,
35
+ distributedOptimizer: false,
36
+ fsdpShardGroupSize: 0,
37
+ zeroStage: 0,
38
+ })
39
+
40
+ expect(analysis.feasible).toBe(true)
41
+ expect(analysis.derivedParallelism.dp).toBe(1)
42
+ expect(analysis.memoryBreakdown.totalGB).toBeLessThan(80)
43
+ })
44
+
45
+ it('marks Llama 2 70B on 8x A100 80GB as infeasible for unsharded Adam training', () => {
46
+ const analysis = analyzeCluster(llama70B(), baselineTraining, singleNode8GPU(a100_80gb()), {
47
+ tp: 8,
48
+ pp: 1,
49
+ cp: 1,
50
+ ep: 1,
51
+ distributedOptimizer: false,
52
+ fsdpShardGroupSize: 0,
53
+ zeroStage: 0,
54
+ })
55
+
56
+ expect(analysis.feasible).toBe(false)
57
+ expect(analysis.infeasibilityReason).toContain('exceeding 80 GB of HBM')
58
+ })
59
+
60
+ it('keeps MFU in a realistic range for a balanced 64x H100 dense run', () => {
61
+ const analysis = analyzeCluster(
62
+ llama70B(),
63
+ {
64
+ ...baselineTraining,
65
+ seqLength: 4096,
66
+ gradAccumSteps: 16,
67
+ },
68
+ cluster64GPU(h100_sxm()),
69
+ {
70
+ tp: 4,
71
+ pp: 4,
72
+ cp: 1,
73
+ ep: 1,
74
+ distributedOptimizer: true,
75
+ fsdpShardGroupSize: 0,
76
+ zeroStage: 1,
77
+ },
78
+ )
79
+
80
+ expect(analysis.feasible).toBe(true)
81
+ expect(analysis.derivedParallelism.dp).toBe(4)
82
+ expect(analysis.throughput.mfu).toBeGreaterThan(0.3)
83
+ expect(analysis.throughput.mfu).toBeLessThanOrEqual(0.62)
84
+ })
85
+
86
+ it('reduces activation memory when CP increases and adds CP communication', () => {
87
+ const withoutCp = analyzeCluster(
88
+ llama70B(),
89
+ {
90
+ ...baselineTraining,
91
+ seqLength: 4096,
92
+ },
93
+ cluster64GPU(h100_sxm()),
94
+ {
95
+ tp: 2,
96
+ pp: 2,
97
+ cp: 1,
98
+ ep: 1,
99
+ distributedOptimizer: true,
100
+ fsdpShardGroupSize: 0,
101
+ zeroStage: 1,
102
+ },
103
+ )
104
+ const withCp = analyzeCluster(
105
+ llama70B(),
106
+ {
107
+ ...baselineTraining,
108
+ seqLength: 4096,
109
+ },
110
+ cluster64GPU(h100_sxm()),
111
+ {
112
+ tp: 2,
113
+ pp: 2,
114
+ cp: 4,
115
+ ep: 1,
116
+ distributedOptimizer: true,
117
+ fsdpShardGroupSize: 0,
118
+ zeroStage: 1,
119
+ },
120
+ )
121
+
122
+ expect(withCp.memoryBreakdown.activationsGB).toBeLessThan(withoutCp.memoryBreakdown.activationsGB)
123
+ expect(withCp.communication.cp.totalVolumePerStepGB).toBeGreaterThan(0)
124
+ })
125
+
126
+ it('reduces OLMo memory with HSDP shard groups compared with plain DP', () => {
127
+ const cluster = {
128
+ ...cluster64GPU(h100_sxm()),
129
+ numNodes: 128,
130
+ nodesPerRack: 16,
131
+ }
132
+ const plain = analyzeCluster(
133
+ olmo3_32B(),
134
+ {
135
+ microBatchSize: 1,
136
+ seqLength: 8192,
137
+ gradAccumSteps: 1,
138
+ precision: 'bf16',
139
+ activationCheckpointing: true,
140
+ optimizer: 'adamw',
141
+ },
142
+ cluster,
143
+ {
144
+ tp: 1,
145
+ pp: 1,
146
+ cp: 1,
147
+ ep: 1,
148
+ distributedOptimizer: false,
149
+ fsdpShardGroupSize: 0,
150
+ zeroStage: 0,
151
+ },
152
+ )
153
+ const hsdp = analyzeCluster(
154
+ olmo3_32B(),
155
+ {
156
+ microBatchSize: 1,
157
+ seqLength: 8192,
158
+ gradAccumSteps: 1,
159
+ precision: 'bf16',
160
+ activationCheckpointing: true,
161
+ optimizer: 'adamw',
162
+ },
163
+ cluster,
164
+ {
165
+ tp: 1,
166
+ pp: 1,
167
+ cp: 1,
168
+ ep: 1,
169
+ distributedOptimizer: true,
170
+ fsdpShardGroupSize: 256,
171
+ zeroStage: 3,
172
+ },
173
+ )
174
+
175
+ expect(hsdp.derivedParallelism.replicaGroups).toBe(4)
176
+ expect(hsdp.memoryBreakdown.totalGB).toBeLessThan(plain.memoryBreakdown.totalGB)
177
+ expect(hsdp.communication.fsdp.totalVolumePerStepGB).toBeGreaterThan(0)
178
+ })
179
+
180
+ it('models Trinity as total-parameter-heavy but active-compute-light', () => {
181
+ const analysis = analyzeCluster(
182
+ trinityLarge400B(),
183
+ {
184
+ microBatchSize: 1,
185
+ seqLength: 8192,
186
+ gradAccumSteps: 8,
187
+ precision: 'bf16',
188
+ activationCheckpointing: true,
189
+ optimizer: 'muon',
190
+ },
191
+ trinityCluster(),
192
+ {
193
+ tp: 1,
194
+ pp: 1,
195
+ cp: 1,
196
+ ep: 8,
197
+ distributedOptimizer: true,
198
+ fsdpShardGroupSize: 128,
199
+ zeroStage: 3,
200
+ },
201
+ )
202
+
203
+ expect(analysis.totalParams).toBeGreaterThan(300_000_000_000)
204
+ expect(analysis.activeParamsPerToken).toBe(13_000_000_000)
205
+ expect(analysis.communication.ep.totalVolumePerStepGB).toBeGreaterThan(0)
206
+ expect(analysis.communication.ep.usesInterNode).toBe(false)
207
+ expect(new Set(analysis.gpuMap.map((gpu) => gpu.epLane))).toEqual(
208
+ new Set([0, 1, 2, 3, 4, 5, 6, 7]),
209
+ )
210
+ })
211
+
212
+ it('derives DP for Llama 3.1 405B from world size and 4D parallelism', () => {
213
+ const analysis = analyzeCluster(
214
+ llama31_405B(),
215
+ {
216
+ microBatchSize: 1,
217
+ seqLength: 8192,
218
+ gradAccumSteps: 16,
219
+ precision: 'bf16',
220
+ activationCheckpointing: true,
221
+ optimizer: 'adamw',
222
+ },
223
+ llama405Cluster(),
224
+ {
225
+ tp: 8,
226
+ pp: 16,
227
+ cp: 1,
228
+ ep: 1,
229
+ distributedOptimizer: true,
230
+ fsdpShardGroupSize: 0,
231
+ zeroStage: 1,
232
+ },
233
+ )
234
+
235
+ expect(analysis.derivedParallelism.dp).toBe(128)
236
+ expect(analysis.feasible).toBe(true)
237
+ expect(analysis.communication.tp.totalVolumePerStepGB).toBeGreaterThan(0)
238
+ expect(analysis.communication.pp.totalVolumePerStepGB).toBeGreaterThan(0)
239
+ expect(analysis.communication.fsdp.totalVolumePerStepGB).toBe(0)
240
+ })
241
+ })
242
+
243
+ function llama405Cluster(): ClusterConfig {
244
+ return {
245
+ gpuType: h100_sxm(),
246
+ gpusPerNode: 8,
247
+ numNodes: 2048,
248
+ intraNodeBandwidthGBs: 900,
249
+ interNodeBandwidthGBs: 50,
250
+ nodesPerRack: 16,
251
+ rackLabel: 'rack',
252
+ nodeLabel: 'GPU host',
253
+ podLabel: 'rack',
254
+ }
255
+ }
256
+
257
+ function trinityCluster(): ClusterConfig {
258
+ return {
259
+ gpuType: b300(),
260
+ gpusPerNode: 8,
261
+ numNodes: 256,
262
+ intraNodeBandwidthGBs: 900,
263
+ interNodeBandwidthGBs: 50,
264
+ nodesPerRack: 9,
265
+ rackLabel: 'rack',
266
+ nodeLabel: 'GPU host',
267
+ podLabel: 'rack',
268
+ }
269
+ }
tsconfig.app.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
4
+ "target": "ES2022",
5
+ "useDefineForClassFields": true,
6
+ "lib": ["ES2022", "DOM", "DOM.Iterable"],
7
+ "module": "ESNext",
8
+ "types": ["vite/client"],
9
+ "skipLibCheck": true,
10
+
11
+ /* Bundler mode */
12
+ "moduleResolution": "bundler",
13
+ "allowImportingTsExtensions": true,
14
+ "verbatimModuleSyntax": true,
15
+ "moduleDetection": "force",
16
+ "noEmit": true,
17
+ "jsx": "react-jsx",
18
+
19
+ /* Linting */
20
+ "strict": true,
21
+ "noUnusedLocals": true,
22
+ "noUnusedParameters": true,
23
+ "erasableSyntaxOnly": true,
24
+ "noFallthroughCasesInSwitch": true,
25
+ "noUncheckedSideEffectImports": true
26
+ },
27
+ "include": ["src"]
28
+ }
tsconfig.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "references": [
4
+ { "path": "./tsconfig.app.json" },
5
+ { "path": "./tsconfig.node.json" }
6
+ ]
7
+ }
tsconfig.node.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
4
+ "target": "ES2023",
5
+ "lib": ["ES2023"],
6
+ "module": "ESNext",
7
+ "types": ["node"],
8
+ "skipLibCheck": true,
9
+
10
+ /* Bundler mode */
11
+ "moduleResolution": "bundler",
12
+ "allowImportingTsExtensions": true,
13
+ "verbatimModuleSyntax": true,
14
+ "moduleDetection": "force",
15
+ "noEmit": true,
16
+
17
+ /* Linting */
18
+ "strict": true,
19
+ "noUnusedLocals": true,
20
+ "noUnusedParameters": true,
21
+ "erasableSyntaxOnly": true,
22
+ "noFallthroughCasesInSwitch": true,
23
+ "noUncheckedSideEffectImports": true
24
+ },
25
+ "include": ["vite.config.ts"]
26
+ }
vite.config.ts ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { defineConfig } from 'vite'
2
+ import react from '@vitejs/plugin-react'
3
+
4
+ export default defineConfig({
5
+ plugins: [react()],
6
+ server: {
7
+ host: '0.0.0.0',
8
+ port: 7860,
9
+ },
10
+ preview: {
11
+ host: '0.0.0.0',
12
+ port: 7860,
13
+ },
14
+ })
vitest.config.ts ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import { defineConfig } from 'vitest/config'
2
+
3
+ export default defineConfig({
4
+ test: {
5
+ include: ['tests/**/*.test.ts'],
6
+ },
7
+ })