// illustrated-cluster/src/lib/trainingClusterModel.ts
// WIP HF Space snapshot published by joeddav (commit 1f77aa7).
// Architecture hyperparameters of the model being trained.
export interface ModelConfig {
  architecture: 'dense' | 'moe'
  hiddenDim: number // residual-stream width; must divide evenly by numHeads
  numLayers: number
  numHeads: number // query attention heads
  numKVHeads: number // KV heads (GQA when < numHeads, MHA when equal)
  vocabSize: number
  intermediateSize: number // dense MLP width (gated: 3 weight matrices per layer)
  tiedEmbeddings: boolean // true → LM head reuses the embedding matrix (no extra params)
  // Optional attention scheme; absent or 'full' means every layer attends
  // over the entire sequence.
  attentionProfile?: {
    type: 'full' | 'hybrid'
    slidingWindowSize?: number // tokens each windowed layer attends over
    globalAttentionFraction?: number // share of global attention, 0..1
    globalAttentionEveryN?: number // alternative spec: one global layer every N
  }
  // Mixture-of-experts metadata; expected when architecture === 'moe'.
  moe?: {
    numExperts: number
    expertsPerToken: number // top-k routed experts per token
    numDenseLayers: number // leading layers that remain dense
    expertIntermediateSize: number // per-expert MLP width
    activeParamsPerToken?: number // explicit override of the derived active count
  }
}
// Training-run hyperparameters driving memory and throughput estimates.
export interface TrainingConfig {
  microBatchSize: number // sequences per micro-batch per model replica
  seqLength: number // tokens per sequence
  gradAccumSteps: number // micro-batches accumulated per optimizer step
  precision: 'fp32' | 'bf16' | 'fp16' | 'fp8'
  activationCheckpointing: boolean // recompute activations during backward
  optimizer: 'adam' | 'adamw' | 'sgd' | 'muon'
}
// Per-GPU hardware characteristics.
export interface GPUSpec {
  name: string // product name; also matched to pick default fabric bandwidths
  hbmCapacityGB: number
  peakTFLOPsBF16: number // dense bf16 peak; other precisions are scaled from it
  memBandwidthTBs: number // HBM bandwidth — not consumed by the visible estimates
}
// Physical cluster topology and interconnect bandwidths.
export interface ClusterConfig {
  gpuType: GPUSpec
  gpusPerNode: number
  numNodes: number
  intraNodeBandwidthGBs: number // NVLink-class bandwidth within a node (GB/s)
  interNodeBandwidthGBs: number // network bandwidth between nodes (GB/s)
  // Presentation metadata — presumably used by the visualization layer; verify.
  nodesPerRack?: number
  rackLabel?: string
  nodeLabel?: string
  podLabel?: string
}
// User-selected parallelism degrees. Data parallelism is not chosen here —
// it is derived from the leftover GPUs (see getDerivedParallelism).
export interface ParallelismConfig {
  tp: number // tensor parallel
  pp: number // pipeline parallel
  cp: number // context (sequence) parallel
  ep: number // expert parallel
  distributedOptimizer: boolean // shard optimizer state across all DP ranks
  fsdpShardGroupSize: number // GPUs per FSDP shard group; <= 1 disables FSDP grouping
  zeroStage: 0 | 1 | 2 | 3 // ZeRO: 1 shards optimizer, 2 adds gradients, 3 adds parameters
}
// Complete output of analyzeCluster: feasibility verdict plus memory,
// communication, throughput, and per-GPU placement estimates.
export interface ClusterAnalysis {
  feasible: boolean
  // Human-readable cause, populated only when feasible is false.
  infeasibilityReason?: string
  totalParams: number
  // Equals totalParams for dense models; smaller for MoE (top-k routing).
  activeParamsPerToken: number
  globalBatchSizeTokens: number
  totalGPUs: number
  // Degrees derived from the cluster size and the user's parallelism choices.
  derivedParallelism: {
    dp: number
    replicaGroups: number
    fsdpShardGroupSize: number
    fsdpGroupSize: number
    ep: number
  }
  // Memory estimate for a single GPU (GB), compared against its HBM capacity.
  memoryBreakdown: {
    parametersGB: number
    optimizerStatesGB: number
    gradientsGB: number
    activationsGB: number
    totalGB: number
    hbmCapacityGB: number
    utilizationPercent: number
  }
  // One entry per pipeline stage; layerRange is inclusive on both ends.
  pipelineStages: {
    stageIndex: number
    layerRange: [number, number]
    numLayers: number
    memoryGB: number
    hasEmbedding: boolean // stage 0 hosts the embedding table
    hasOutputHead: boolean // last stage hosts the final norm + output head
  }[]
  // Estimated traffic per optimizer step, broken down by parallelism style.
  communication: {
    tp: {
      allReducesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
    }
    // Point-to-point activation transfers between adjacent pipeline stages.
    pp: {
      activationMessageSizeBytes: number
      numP2PTransfersPerStep: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      usesInterNode: boolean
    }
    cp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    fsdp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    // All-to-all token routing for MoE expert parallelism.
    ep: {
      allToAllsPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    // Gradient all-reduce across data-parallel replicas.
    dp: {
      gradientVolumePerGPU_GB: number
      allReduceTimeMs: number
      canOverlapWithBackward: boolean
      linkUtilizationPercent: number
    }
  }
  throughput: {
    computeTimePerStepMs: number
    communicationTimePerStepMs: number
    pipelineBubbleFraction: number
    pipelineBubbleTimeMs: number
    totalStepTimeMs: number
    tokensPerSecond: number
    mfu: number // model FLOPs utilization
  }
  // One entry per physical GPU; -1 coordinates mark unassigned GPUs.
  gpuMap: {
    globalGPUIndex: number
    nodeIndex: number
    localGPUIndex: number
    tpGroup: number
    tpLane: number
    ppStage: number
    cpShard: number
    epLane: number
    dpReplica: number
    replicaGroup: number
    fsdpRank: number
    memoryUsedGB: number
    memoryCapacityGB: number
    isActive: boolean
  }[]
  // Logical links with attributed traffic, per parallelism style.
  links: {
    fromGPU: number
    toGPU: number
    type: 'nvlink' | 'infiniband'
    trafficType: 'tp' | 'pp' | 'cp' | 'fsdp' | 'ep' | 'dp'
    volumeGB: number
    utilizationPercent: number
  }[]
}
// Layer span assigned to one pipeline stage (inclusive layer indices).
type LayerDistribution = {
  stageIndex: number
  startLayer: number
  endLayer: number
  numLayers: number
}
// Per-GPU memory breakdown for one pipeline stage, in GB.
type StageMemory = {
  parametersGB: number
  optimizerStatesGB: number
  gradientsGB: number
  activationsGB: number
  totalGB: number
}
// Unsharded parameter counts owned by one pipeline stage.
type StageParameterCount = {
  stageParams: number // sharedParams + expertParams
  sharedParams: number // attention/norm/MLP (+ embedding/head on edge stages)
  expertParams: number // MoE expert weights only
  denseLayers: number
  moeLayers: number
  hasEmbedding: boolean
  hasOutputHead: boolean
}
// Logical coordinates of a single GPU in the parallelism grid; -1 marks an
// unassigned placeholder GPU.
type PlacementEntry = {
  globalGPUIndex: number
  nodeIndex: number
  localGPUIndex: number
  tpGroup: number
  tpLane: number
  ppStage: number
  cpShard: number
  epLane: number
  dpReplica: number
  replicaGroup: number
  fsdpRank: number
  isActive: boolean
}
// Group sizes derived from the cluster and the chosen parallelism degrees.
type DerivedParallelism = {
  modelParallelSize: number // tp * pp * cp * ep
  dp: number
  replicaGroups: number
  fsdpGroupSize: number
  fsdpDataParallelDegree: number
}
// Parameter-accounting result; its shape is whatever getModelBreakdown returns.
type ModelBreakdown = ReturnType<typeof getModelBreakdown>
// Output of the ring-collective cost model (getRingCommStats).
type RingCommStats = {
  volumeBytesPerGpu: number
  totalVolumeBytes: number
  timePerStepMs: number
  linkUtilizationPercent: number
  usesInterNode: boolean
}
// Decimal gigabyte (10^9 bytes) used for all byte → GB conversions.
const BYTES_PER_GB = 1e9
// Collectives issued per transformer layer for each parallelism style.
// NOTE(review): these look like Megatron-style forward+backward counts —
// confirm against the intended communication model.
const TP_ALL_REDUCES_PER_LAYER = 4
const CP_COLLECTIVES_PER_LAYER = 2
const FSDP_COLLECTIVES_PER_LAYER = 4
const EP_ALL_TO_ALLS_PER_LAYER = 2
// Baseline sustained-to-peak FLOP fraction assumed for bf16 training.
const DEFAULT_BF16_EFFICIENCY = 0.56
// Restrict `value` to the inclusive range [min, max].
const clamp = (value: number, min: number, max: number) => {
  const atLeastMin = Math.max(value, min)
  return Math.min(atLeastMin, max)
}
// Convert a raw byte count to decimal gigabytes.
const bytesToGB = (bytes: number) => {
  return bytes / BYTES_PER_GB
}
// Round to two decimal places (for display-friendly numbers).
const round2 = (value: number) => {
  return Math.round(value * 100) / 100
}
// Bytes needed to store one weight at the given training precision.
const getParameterBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }
  if (precision === 'fp8') {
    return 1
  }
  // bf16 / fp16
  return 2
}
// Activations are stored in fp32 only for fp32 training; otherwise 16-bit.
const getActivationBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }
  return 2
}
// Gradients mirror the activation width: fp32 runs keep fp32 gradients.
const getGradientBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }
  return 2
}
// Optimizer-state bytes per parameter for the given optimizer/precision combo.
const getOptimizerBytesPerParam = (
  optimizer: TrainingConfig['optimizer'],
  precision: TrainingConfig['precision'],
) => {
  switch (optimizer) {
    case 'sgd':
      // Single fp32 momentum buffer.
      return 4
    case 'muon':
      // Muon keeps lower optimizer state than Adam-family optimizers in practice.
      // We model it as 8 bytes per parameter of extra state on top of bf16 weights.
      return 8
    default:
      // Adam/AdamW: two fp32 moments (8 B); mixed precision adds a master copy (12 B).
      return precision === 'fp32' ? 8 : 12
  }
}
// Scale the GPU's bf16 peak to the training precision: fp32 runs at a quarter,
// fp8 at double, and fp16 matches bf16.
const getPeakTFLOPsForPrecision = (gpu: GPUSpec, precision: TrainingConfig['precision']) => {
  let scale = 1
  if (precision === 'fp32') {
    scale = 0.25
  } else if (precision === 'fp8') {
    scale = 2
  }
  return gpu.peakTFLOPsBF16 * scale
}
// Estimate the fraction of peak FLOPs a GPU sustains for this training recipe.
// Starts from the bf16 baseline MFU and applies small recipe-dependent
// adjustments, clamped to a plausible [0.3, 0.62] band.
const getSustainedComputeEfficiency = (training: TrainingConfig) => {
  // Recompute in backward adds overhead when activation checkpointing is on.
  const checkpointPenalty = training.activationCheckpointing ? 0.02 : 0
  const fp32Penalty = training.precision === 'fp32' ? 0.08 : 0
  // Fix: this was previously named `moeBoost`, but the adjustment is tied to
  // the Muon optimizer, not to MoE models.
  const muonBoost = training.optimizer === 'muon' ? 0.02 : 0
  return clamp(DEFAULT_BF16_EFFICIENCY - checkpointPenalty - fp32Penalty + muonBoost, 0.3, 0.62)
}
// Split `numLayers` transformer layers across `pp` pipeline stages as evenly
// as possible; earlier stages absorb the remainder (one extra layer each).
const distributeLayers = (numLayers: number, pp: number): LayerDistribution[] => {
  const perStage = Math.floor(numLayers / pp)
  const stagesWithExtra = numLayers % pp
  const stages: LayerDistribution[] = []
  let cursor = 0
  for (let stageIndex = 0; stageIndex < pp; stageIndex += 1) {
    const count = perStage + (stageIndex < stagesWithExtra ? 1 : 0)
    stages.push({
      stageIndex,
      startLayer: cursor,
      endLayer: cursor + count - 1,
      numLayers: count,
    })
    cursor += count
  }
  return stages
}
// Fallback fabric bandwidths (GB/s) inferred from the GPU product name.
const getDefaultFabric = (gpu: GPUSpec) => {
  const name = gpu.name.toLowerCase()
  if (name.includes('gb200')) {
    return { intraNodeBandwidthGBs: 900, interNodeBandwidthGBs: 100 }
  }
  if (name.includes('h100')) {
    return { intraNodeBandwidthGBs: 450, interNodeBandwidthGBs: 100 }
  }
  // Conservative default for unrecognized hardware.
  return { intraNodeBandwidthGBs: 300, interNodeBandwidthGBs: 50 }
}
// Count parameters for the configured model: per-layer building blocks
// (attention, MLP, norms, experts) plus totals and the per-token active count.
// The inferred return type is aliased as ModelBreakdown for the whole module.
// Note: MoE router/gate weights are not counted.
const getModelBreakdown = (model: ModelConfig) => {
  const headDim = model.hiddenDim / model.numHeads
  const embeddingParams = model.vocabSize * model.hiddenDim
  const kvProjectionDim = model.numKVHeads * headDim
  // Attention projections: Q (d*d), K and V (d*kv each), output (d*d).
  const perLayerAttentionParams =
    model.hiddenDim * (model.hiddenDim + 2 * kvProjectionDim + model.hiddenDim)
  // Gated MLP: up, gate, and down projections (3 matrices of d*intermediate).
  const perLayerDenseMlpParams = model.hiddenDim * model.intermediateSize * 3
  // Two norm scale vectors per layer.
  const perLayerNormParams = model.hiddenDim * 2
  const finalNormParams = model.hiddenDim
  // Tied embeddings reuse the input table for the LM head.
  const outputHeadParams = model.tiedEmbeddings ? 0 : embeddingParams
  // One expert = a gated MLP of the expert's intermediate width.
  const perExpertParams =
    model.architecture === 'moe' && model.moe
      ? model.hiddenDim * model.moe.expertIntermediateSize * 3
      : 0
  const totalExpertParamsPerLayer =
    model.architecture === 'moe' && model.moe ? perExpertParams * model.moe.numExperts : 0
  // Dense models treat every layer as dense; MoE keeps the leading
  // numDenseLayers dense and makes the rest MoE layers.
  const denseLayerCount =
    model.architecture === 'moe' && model.moe ? model.moe.numDenseLayers : model.numLayers
  const moeLayerCount = model.numLayers - denseLayerCount
  // "Shared" = parameters every token touches (attention, norms, dense MLP).
  const sharedDenseLayerParams =
    perLayerAttentionParams + perLayerDenseMlpParams + perLayerNormParams
  const sharedMoeLayerParams = perLayerAttentionParams + perLayerNormParams
  const sharedParams =
    embeddingParams +
    denseLayerCount * sharedDenseLayerParams +
    moeLayerCount * sharedMoeLayerParams +
    finalNormParams +
    outputHeadParams
  const totalParams = sharedParams + moeLayerCount * totalExpertParamsPerLayer
  // Active-per-token: shared weights plus only the top-k routed experts.
  const derivedActiveParams =
    model.architecture === 'moe' && model.moe
      ? embeddingParams +
        denseLayerCount * sharedDenseLayerParams +
        moeLayerCount *
          (sharedMoeLayerParams + model.moe.expertsPerToken * perExpertParams) +
        finalNormParams +
        outputHeadParams
      : totalParams
  // Callers may pin the active-parameter count explicitly (e.g. from a model card).
  const activeParamsPerToken =
    model.architecture === 'moe' && model.moe?.activeParamsPerToken != null
      ? model.moe.activeParamsPerToken
      : derivedActiveParams
  const perLayerTotalParams =
    model.architecture === 'moe'
      ? sharedMoeLayerParams + totalExpertParamsPerLayer
      : sharedDenseLayerParams
  return {
    headDim,
    kvProjectionDim,
    embeddingParams,
    perLayerAttentionParams,
    perLayerDenseMlpParams,
    perLayerNormParams,
    perExpertParams,
    totalExpertParamsPerLayer,
    sharedDenseLayerParams,
    sharedMoeLayerParams,
    denseLayerCount,
    moeLayerCount,
    sharedParams,
    perLayerTotalParams,
    finalNormParams,
    outputHeadParams,
    totalParams,
    activeParamsPerToken,
  }
}
// Number of micro-batches simultaneously in flight (bounds activation memory).
// Without pipelining only one is live; with PP up to `pp` micro-batches are in
// flight, capped by the number of accumulation steps.
const getConcurrentMicroBatches = (
  training: TrainingConfig,
  parallelism: ParallelismConfig,
) => {
  const { pp } = parallelism
  if (pp <= 1) {
    return 1
  }
  const inFlight = Math.min(training.gradAccumSteps, pp)
  return Math.max(1, inFlight)
}
// Effective fraction of full-attention cost for hybrid/sliding-window schemes.
// Full attention (or no profile) costs 1.0; a hybrid scheme blends the global
// share with the windowed share of the sequence.
const getAttentionMultiplier = (model: ModelConfig, seqLength: number) => {
  const profile = model.attentionProfile
  if (profile == null || profile.type === 'full') {
    return 1
  }
  // Fraction of the sequence each sliding-window layer attends over.
  let windowMultiplier = 1
  if (profile.slidingWindowSize != null) {
    windowMultiplier = Math.min(Math.max(profile.slidingWindowSize / seqLength, 0), 1)
  }
  // Share of attention that is still global: explicit fraction, or 1/N when
  // expressed as "one global layer every N", or a 0.25 default.
  const globalFraction =
    profile.globalAttentionFraction ??
    (profile.globalAttentionEveryN != null ? 1 / profile.globalAttentionEveryN : 0.25)
  const blended = globalFraction + (1 - globalFraction) * windowMultiplier
  return Math.min(Math.max(blended, windowMultiplier), 1)
}
// Split a pipeline stage's layer count into dense vs. MoE layers. Dense layers
// occupy the first numDenseLayers positions of the network.
const getStageLayerMix = (stage: LayerDistribution, model: ModelConfig) => {
  if (model.architecture !== 'moe' || !model.moe) {
    return { denseLayers: stage.numLayers, moeLayers: 0 }
  }
  const lastDenseLayer = model.moe.numDenseLayers - 1
  let denseLayers = 0
  if (lastDenseLayer >= stage.startLayer) {
    // Overlap between [startLayer, endLayer] and the dense prefix.
    const overlapEnd = Math.min(stage.endLayer, lastDenseLayer)
    denseLayers = Math.max(0, overlapEnd - stage.startLayer + 1)
  }
  return { denseLayers, moeLayers: stage.numLayers - denseLayers }
}
// Parameters owned by one pipeline stage (before TP/EP sharding), split into
// shared (attention/norm/MLP + embedding/head) and expert weights.
const getStageParameterCount = (
  stage: LayerDistribution,
  modelBreakdown: ModelBreakdown,
  parallelism: ParallelismConfig,
  model: ModelConfig,
): StageParameterCount => {
  const { denseLayers, moeLayers } = getStageLayerMix(stage, model)
  const expertParams = moeLayers * modelBreakdown.totalExpertParamsPerLayer
  // First stage hosts the embedding table; last stage hosts final norm + head.
  const hasEmbedding = stage.stageIndex === 0
  const hasOutputHead = stage.stageIndex === parallelism.pp - 1
  let sharedParams =
    denseLayers * modelBreakdown.sharedDenseLayerParams +
    moeLayers * modelBreakdown.sharedMoeLayerParams
  if (hasEmbedding) {
    sharedParams += modelBreakdown.embeddingParams
  }
  if (hasOutputHead) {
    sharedParams += modelBreakdown.finalNormParams + modelBreakdown.outputHeadParams
  }
  return {
    stageParams: sharedParams + expertParams,
    sharedParams,
    expertParams,
    denseLayers,
    moeLayers,
    hasEmbedding,
    hasOutputHead,
  }
}
// Estimate the activation bytes one transformer layer keeps resident per GPU
// for a single micro-batch, after CP/TP/EP sharding. The 2x/0.25x/6x
// multipliers below are heuristic working-set factors, not exact accounting.
const getActivationMemoryBytesPerLayer = ({
  model,
  training,
  parallelism,
  isMoeLayer,
}: {
  model: ModelConfig
  training: TrainingConfig
  parallelism: ParallelismConfig
  isMoeLayer: boolean
}) => {
  const activationBytes = getActivationBytes(training.precision)
  // CP splits the sequence, so each rank holds seqLength / cp tokens.
  const shardedSequenceLength = training.seqLength / parallelism.cp
  const tokensPerShard = training.microBatchSize * shardedSequenceLength
  const kvHiddenDim = model.numKVHeads * (model.hiddenDim / model.numHeads)
  const tpSequenceShardFactor = parallelism.tp > 1 ? parallelism.tp : 1
  // Sequence parallelism shards the residual stream and checkpointed layer boundaries across
  // the TP group. We assume TP-enabled dense training uses this Megatron-style optimization.
  const hiddenStateBytes =
    (tokensPerShard * model.hiddenDim * activationBytes) / tpSequenceShardFactor
  const attentionMultiplier = getAttentionMultiplier(model, training.seqLength)
  // Sequence-parallel CP reduces the activation footprint by the number of sequence shards.
  const qkvBytes =
    tokensPerShard * (model.hiddenDim + 2 * kvHiddenDim) * activationBytes * attentionMultiplier
  // Two live intermediate tensors (up + gate) for the dense MLP.
  const denseMlpBytes = tokensPerShard * model.intermediateSize * activationBytes * 2
  // MoE layers: top-k expert activations, spread across the EP group.
  const moeMlpBytes =
    isMoeLayer && model.moe
      ? (tokensPerShard *
          model.moe.expertIntermediateSize *
          activationBytes *
          model.moe.expertsPerToken *
          2) /
        Math.max(parallelism.ep, 1)
      : 0
  // Attention + MLP intermediates are sharded across TP ranks.
  const shardedIntermediateBytes =
    (qkvBytes + (isMoeLayer ? moeMlpBytes : denseMlpBytes)) / Math.max(parallelism.tp, 1)
  if (training.activationCheckpointing) {
    // Checkpointing keeps roughly layer-boundary tensors plus a small
    // recompute working set.
    return hiddenStateBytes * 2 + shardedIntermediateBytes * 0.25
  }
  // Without checkpointing, several hidden-state-sized tensors (residuals,
  // norms, attention output) and full intermediates stay live.
  return hiddenStateBytes * 6 + shardedIntermediateBytes * 2
}
// Per-GPU memory (GB) for one pipeline stage: parameters, optimizer states,
// gradients, and resident activations after TP/EP/ZeRO/FSDP sharding.
const getStageMemory = (
  stageParams: StageParameterCount,
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const parameterBytes = getParameterBytes(training.precision)
  const gradientBytes = getGradientBytes(training.precision)
  const optimizerBytes = getOptimizerBytesPerParam(training.optimizer, training.precision)
  // FSDP shards within the shard group; the plain distributed optimizer shards
  // across the whole DP dimension instead.
  const fsdpShardFactor =
    parallelism.fsdpShardGroupSize > 1 ? derivedParallelism.fsdpDataParallelDegree : 1
  const distributedShardFactor = parallelism.distributedOptimizer ? derivedParallelism.dp : 1
  // ZeRO-3 additionally shards the weights themselves.
  const parameterShardFactor =
    parallelism.zeroStage >= 3 ? fsdpShardFactor : 1
  // ZeRO-1+ shards optimizer states; FSDP grouping takes precedence when on.
  const optimizerShardFactor =
    parallelism.zeroStage >= 1
      ? parallelism.fsdpShardGroupSize > 1
        ? fsdpShardFactor
        : distributedShardFactor
      : 1
  // ZeRO-2+ shards gradients.
  const gradientShardFactor =
    parallelism.zeroStage >= 2
      ? parallelism.fsdpShardGroupSize > 1
        ? fsdpShardFactor
        : derivedParallelism.dp
      : 1
  // TP splits shared weights; expert weights additionally split across EP.
  const sharedParamsLocal = stageParams.sharedParams / Math.max(parallelism.tp, 1)
  const expertParamsLocal =
    stageParams.expertParams / Math.max(parallelism.tp * parallelism.ep, 1)
  const parameterMemoryBytes =
    (sharedParamsLocal / parameterShardFactor + expertParamsLocal / parameterShardFactor) *
    parameterBytes
  const optimizerMemoryBytes =
    (sharedParamsLocal / optimizerShardFactor + expertParamsLocal / optimizerShardFactor) *
    optimizerBytes
  const gradientMemoryBytes =
    (sharedParamsLocal / gradientShardFactor + expertParamsLocal / gradientShardFactor) *
    gradientBytes
  const denseLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: false,
  })
  const moeLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: true,
  })
  // With pipelining, several micro-batches' activations are live at once.
  const concurrentMicroBatches = getConcurrentMicroBatches(training, parallelism)
  let activationMemoryBytes =
    (denseLayerActivationBytes * stageParams.denseLayers +
      moeLayerActivationBytes * stageParams.moeLayers) *
    concurrentMicroBatches
  if (training.activationCheckpointing && stageParams.stageParams > 0) {
    // Headroom for the layer currently being recomputed during backward.
    activationMemoryBytes +=
      Math.max(denseLayerActivationBytes, moeLayerActivationBytes) * 1.5
  }
  const totalBytes =
    parameterMemoryBytes + optimizerMemoryBytes + gradientMemoryBytes + activationMemoryBytes
  return {
    parametersGB: bytesToGB(parameterMemoryBytes),
    optimizerStatesGB: bytesToGB(optimizerMemoryBytes),
    gradientsGB: bytesToGB(gradientMemoryBytes),
    activationsGB: bytesToGB(activationMemoryBytes),
    totalGB: bytesToGB(totalBytes),
  }
}
const getStageMemoryMap = (
model: ModelConfig,
training: TrainingConfig,
parallelism: ParallelismConfig,
derivedParallelism: DerivedParallelism,
) => {
const modelBreakdown = getModelBreakdown(model)
const layerDistribution = distributeLayers(model.numLayers, parallelism.pp)
const stageMemory = new Map<number, StageMemory>()
const stageParameters = new Map<number, StageParameterCount>()
for (const stage of layerDistribution) {
const stageParameterCount = getStageParameterCount(stage, modelBreakdown, parallelism, model)
stageParameters.set(stage.stageIndex, stageParameterCount)
stageMemory.set(
stage.stageIndex,
getStageMemory(stageParameterCount, model, training, parallelism, derivedParallelism),
)
}
return {
modelBreakdown,
layerDistribution,
stageMemory,
stageParameters,
}
}
// Lay the logical parallelism grid out onto physical GPUs. Iteration order
// keeps TP lanes innermost (adjacent GPUs), then EP, CP, PP, FSDP rank, and
// replica group. Physical GPUs beyond the required count become inactive
// placeholders with -1 coordinates.
const buildPlacement = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
  requiredGPUs: number,
) => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const placement: PlacementEntry[] = []
  let nodeIndex = 0
  let localGPUIndex = 0
  let globalGPUIndex = 0
  for (let replicaGroup = 0; replicaGroup < derivedParallelism.replicaGroups; replicaGroup += 1) {
    for (let fsdpRank = 0; fsdpRank < derivedParallelism.fsdpDataParallelDegree; fsdpRank += 1) {
      // Global DP replica id = (group, rank-within-group) flattened.
      const dpReplica = replicaGroup * derivedParallelism.fsdpDataParallelDegree + fsdpRank
      for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) {
        for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
          // Keep each EP×TP block on one node: advance to a fresh node if the
          // block would straddle the node boundary.
          if (localGPUIndex + parallelism.ep * parallelism.tp > cluster.gpusPerNode) {
            nodeIndex += 1
            localGPUIndex = 0
          }
          for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
            for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
              placement.push({
                globalGPUIndex,
                nodeIndex,
                localGPUIndex,
                // Unique id of the TP group this lane belongs to.
                tpGroup:
                  (((dpReplica * parallelism.pp + ppStage) * parallelism.cp + cpShard) *
                    parallelism.ep) +
                  epLane,
                tpLane,
                ppStage,
                cpShard,
                epLane,
                dpReplica,
                replicaGroup,
                fsdpRank,
                isActive: globalGPUIndex < requiredGPUs,
              })
              globalGPUIndex += 1
              localGPUIndex += 1
            }
          }
        }
      }
    }
  }
  // Pad any remaining physical GPUs as unassigned placeholders.
  while (placement.length < totalGPUs) {
    if (localGPUIndex >= cluster.gpusPerNode) {
      nodeIndex += 1
      localGPUIndex = 0
    }
    placement.push({
      globalGPUIndex,
      nodeIndex,
      localGPUIndex,
      tpGroup: -1,
      tpLane: -1,
      ppStage: -1,
      cpShard: -1,
      epLane: -1,
      dpReplica: -1,
      replicaGroup: -1,
      fsdpRank: -1,
      isActive: false,
    })
    globalGPUIndex += 1
    localGPUIndex += 1
  }
  return placement
}
// Find the first placement entry matching every provided coordinate; filter
// keys left undefined/null are ignored.
const getPlacementEntry = (
  placement: PlacementEntry[],
  filters: Partial<
    Pick<
      PlacementEntry,
      'dpReplica' | 'replicaGroup' | 'fsdpRank' | 'ppStage' | 'cpShard' | 'epLane' | 'tpLane'
    >
  >,
) => {
  const coordinateKeys = [
    'dpReplica',
    'replicaGroup',
    'fsdpRank',
    'ppStage',
    'cpShard',
    'epLane',
    'tpLane',
  ] as const
  return placement.find((entry) =>
    coordinateKeys.every((key) => filters[key] == null || entry[key] === filters[key]),
  )
}
// Derive the data-parallel and FSDP group sizes implied by the cluster size
// and the chosen model-parallel degrees. Returns null when the degrees do not
// tile the cluster evenly.
const getDerivedParallelism = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
): DerivedParallelism | null => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const modelParallelSize = parallelism.tp * parallelism.pp * parallelism.cp * parallelism.ep
  if (modelParallelSize <= 0) {
    return null
  }
  if (totalGPUs % modelParallelSize !== 0) {
    return null
  }
  // Default FSDP group = one model replica (no cross-replica sharding).
  const fsdpGroupSize =
    parallelism.fsdpShardGroupSize > 1 ? parallelism.fsdpShardGroupSize : modelParallelSize
  const tilesModelReplicas = fsdpGroupSize % modelParallelSize === 0
  const tilesCluster = totalGPUs % fsdpGroupSize === 0
  if (!tilesModelReplicas || !tilesCluster) {
    return null
  }
  return {
    modelParallelSize,
    dp: totalGPUs / modelParallelSize,
    replicaGroups: totalGPUs / fsdpGroupSize,
    fsdpGroupSize,
    fsdpDataParallelDegree: fsdpGroupSize / modelParallelSize,
  }
}
// Bandwidth available to a collective: intra-node (NVLink-class) when every
// member sits on a single node, otherwise the slower inter-node fabric.
const getMaxBandwidthForCollective = (
  members: PlacementEntry[],
  cluster: ClusterConfig,
) => {
  if (members.length <= 1) {
    return { bandwidthGBs: cluster.intraNodeBandwidthGBs, usesInterNode: false }
  }
  const firstNode = members[0].nodeIndex
  const usesInterNode = members.some((member) => member.nodeIndex !== firstNode)
  const bandwidthGBs = usesInterNode
    ? cluster.interNodeBandwidthGBs
    : cluster.intraNodeBandwidthGBs
  return { bandwidthGBs, usesInterNode }
}
// Ring-collective cost model: each GPU moves 2*(w-1)/w of the message per
// collective (classic ring all-reduce), repeated `collectiveCount` times per
// step. Bandwidth is chosen from the slowest hop the member set spans.
const getRingCommStats = ({
  groupCount,
  groupWidth,
  messageBytes,
  collectiveCount,
  membersForBandwidth,
  cluster,
  totalStepTimeMs,
}: {
  groupCount: number
  groupWidth: number
  messageBytes: number
  collectiveCount: number
  membersForBandwidth: PlacementEntry[]
  cluster: ClusterConfig
  totalStepTimeMs: number
}): RingCommStats => {
  // Degenerate groups (width 1, nothing to send) communicate nothing.
  if (groupWidth <= 1 || collectiveCount <= 0 || messageBytes <= 0) {
    return {
      volumeBytesPerGpu: 0,
      totalVolumeBytes: 0,
      timePerStepMs: 0,
      linkUtilizationPercent: 0,
      usesInterNode: false,
    }
  }
  const ringVolumeBytes = (2 * (groupWidth - 1) * messageBytes) / groupWidth
  const volumeBytesPerGpu = ringVolumeBytes * collectiveCount
  const totalVolumeBytes = volumeBytesPerGpu * groupWidth * groupCount
  const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(
    membersForBandwidth,
    cluster,
  )
  const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
  // Fraction of the link this collective occupies over the whole step window.
  const linkUtilizationPercent =
    totalStepTimeMs > 0
      ? clamp(
          (bytesToGB(volumeBytesPerGpu) / (bandwidthGBs * (totalStepTimeMs / 1000))) * 100,
          0,
          100,
        )
      : 0
  return {
    volumeBytesPerGpu,
    totalVolumeBytes,
    timePerStepMs,
    linkUtilizationPercent,
    usesInterNode,
  }
}
export function analyzeCluster(
model: ModelConfig,
training: TrainingConfig,
cluster: ClusterConfig,
parallelism: ParallelismConfig,
): ClusterAnalysis {
const totalGPUs = cluster.gpusPerNode * cluster.numNodes
const derivedParallelism = getDerivedParallelism(cluster, parallelism)
const globalBatchSizeTokens =
training.microBatchSize *
training.seqLength *
training.gradAccumSteps *
(derivedParallelism?.dp ?? 0)
const emptyGpuMap = Array.from({ length: totalGPUs }, (_, globalGPUIndex) => ({
globalGPUIndex,
nodeIndex: Math.floor(globalGPUIndex / cluster.gpusPerNode),
localGPUIndex: globalGPUIndex % cluster.gpusPerNode,
tpGroup: -1,
tpLane: -1,
ppStage: -1,
cpShard: -1,
epLane: -1,
dpReplica: -1,
replicaGroup: -1,
fsdpRank: -1,
memoryUsedGB: 0,
memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
isActive: false,
}))
const emptyAnalysis = (): ClusterAnalysis => ({
feasible: false,
infeasibilityReason: 'Invalid configuration',
totalParams: 0,
activeParamsPerToken: 0,
globalBatchSizeTokens,
totalGPUs,
derivedParallelism: {
dp: derivedParallelism?.dp ?? 0,
replicaGroups: derivedParallelism?.replicaGroups ?? 0,
fsdpShardGroupSize: parallelism.fsdpShardGroupSize,
fsdpGroupSize: derivedParallelism?.fsdpGroupSize ?? 0,
ep: parallelism.ep,
},
memoryBreakdown: {
parametersGB: 0,
optimizerStatesGB: 0,
gradientsGB: 0,
activationsGB: 0,
totalGB: 0,
hbmCapacityGB: cluster.gpuType.hbmCapacityGB,
utilizationPercent: 0,
},
pipelineStages: [],
communication: {
tp: {
allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER,
messageSizeBytes: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
},
pp: {
activationMessageSizeBytes: 0,
numP2PTransfersPerStep: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
usesInterNode: false,
},
cp: {
collectivesPerLayer: CP_COLLECTIVES_PER_LAYER,
messageSizeBytes: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
usesInterNode: false,
},
fsdp: {
collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER,
messageSizeBytes: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
usesInterNode: false,
},
ep: {
allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER,
messageSizeBytes: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
usesInterNode: false,
},
dp: {
gradientVolumePerGPU_GB: 0,
allReduceTimeMs: 0,
canOverlapWithBackward: false,
linkUtilizationPercent: 0,
},
},
throughput: {
computeTimePerStepMs: 0,
communicationTimePerStepMs: 0,
pipelineBubbleFraction: 0,
pipelineBubbleTimeMs: 0,
totalStepTimeMs: 0,
tokensPerSecond: 0,
mfu: 0,
},
gpuMap: emptyGpuMap,
links: [],
})
if (
training.microBatchSize <= 0 ||
training.seqLength <= 0 ||
training.gradAccumSteps <= 0 ||
parallelism.tp <= 0 ||
parallelism.pp <= 0 ||
parallelism.cp <= 0 ||
parallelism.ep <= 0
) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason = 'Batch sizes and parallelism degrees must all be positive.'
return analysis
}
if (parallelism.tp * parallelism.ep > cluster.gpusPerNode) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`TP × EP requires ${parallelism.tp * parallelism.ep} GPUs per node, but nodes only have ${cluster.gpusPerNode}.`
return analysis
}
if (!derivedParallelism) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`World size ${totalGPUs} must be divisible by TP × PP × CP × EP, and the FSDP shard group must divide the cluster cleanly.`
return analysis
}
if (model.hiddenDim % model.numHeads !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`hiddenDim ${model.hiddenDim} must divide evenly across ${model.numHeads} attention heads.`
return analysis
}
if (model.numHeads % parallelism.tp !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`TP ${parallelism.tp} must divide the ${model.numHeads} attention heads.`
return analysis
}
if (model.numKVHeads % parallelism.tp !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`TP ${parallelism.tp} should divide the ${model.numKVHeads} KV heads for clean GQA sharding.`
return analysis
}
if (training.seqLength % parallelism.cp !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`CP ${parallelism.cp} must divide the sequence length ${training.seqLength}.`
return analysis
}
if (model.architecture === 'moe' && !model.moe) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason = 'MoE models require expert metadata.'
return analysis
}
if (model.architecture === 'moe' && model.moe && model.moe.numExperts % parallelism.ep !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`EP ${parallelism.ep} must divide the ${model.moe.numExperts} experts.`
return analysis
}
const { modelBreakdown, layerDistribution, stageMemory, stageParameters } = getStageMemoryMap(
model,
training,
parallelism,
derivedParallelism,
)
const placement = buildPlacement(cluster, parallelism, derivedParallelism, totalGPUs)
const maxStageLayers = Math.max(...layerDistribution.map((stage) => stage.numLayers), 0)
const pipelineStages = layerDistribution.map((stage) => {
const stageMemoryBreakdown = stageMemory.get(stage.stageIndex)
const stageParameterCount = stageParameters.get(stage.stageIndex)
return {
stageIndex: stage.stageIndex,
layerRange: [stage.startLayer, stage.endLayer] as [number, number],
numLayers: stage.numLayers,
memoryGB: round2(
(stageMemoryBreakdown?.totalGB ?? 0) *
parallelism.tp *
parallelism.cp *
parallelism.ep *
derivedParallelism.dp,
),
hasEmbedding: stageParameterCount?.hasEmbedding ?? false,
hasOutputHead: stageParameterCount?.hasOutputHead ?? false,
}
})
const worstStageIndex = pipelineStages.reduce((worstIndex, stage) => {
const worstStageMemory = stageMemory.get(worstIndex)?.totalGB ?? 0
const candidateStageMemory = stageMemory.get(stage.stageIndex)?.totalGB ?? 0
return candidateStageMemory > worstStageMemory ? stage.stageIndex : worstIndex
}, 0)
const worstStageMemory = stageMemory.get(worstStageIndex) ?? {
parametersGB: 0,
optimizerStatesGB: 0,
gradientsGB: 0,
activationsGB: 0,
totalGB: 0,
}
const pipelineBubbleFraction =
parallelism.pp <= 1
? 0
: (parallelism.pp - 1) / (training.gradAccumSteps + parallelism.pp - 1)
const boundaryStageCount = Math.min(
parallelism.pp,
Math.max(0, Math.round(pipelineBubbleFraction * parallelism.pp)),
)
const gpuMap = placement.map((entry) => {
const stageMemoryBreakdown =
entry.ppStage >= 0
? stageMemory.get(entry.ppStage) ?? {
parametersGB: 0,
optimizerStatesGB: 0,
gradientsGB: 0,
activationsGB: 0,
totalGB: 0,
}
: {
parametersGB: 0,
optimizerStatesGB: 0,
gradientsGB: 0,
activationsGB: 0,
totalGB: 0,
}
const bubbleIdle = entry.ppStage >= parallelism.pp - boundaryStageCount && entry.ppStage >= 0
return {
globalGPUIndex: entry.globalGPUIndex,
nodeIndex: entry.nodeIndex,
localGPUIndex: entry.localGPUIndex,
tpGroup: entry.tpGroup,
tpLane: entry.tpLane,
ppStage: entry.ppStage,
cpShard: entry.cpShard,
epLane: entry.epLane,
dpReplica: entry.dpReplica,
replicaGroup: entry.replicaGroup,
fsdpRank: entry.fsdpRank,
memoryUsedGB: round2(entry.isActive ? stageMemoryBreakdown.totalGB : 0),
memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
isActive: entry.isActive && !bubbleIdle,
}
})
const activationBytes = getActivationBytes(training.precision)
const shardedSequenceLength = training.seqLength / parallelism.cp
const tokensPerMicroBatchShard = training.microBatchSize * shardedSequenceLength
const collectiveMessageBytes =
tokensPerMicroBatchShard * model.hiddenDim * activationBytes
const attentionComputeMultiplier = 0.65 + 0.35 * getAttentionMultiplier(model, training.seqLength)
const activationCheckpointComputeMultiplier = training.activationCheckpointing ? 1.2 : 1
const totalFlopsPerStep =
6 *
modelBreakdown.activeParamsPerToken *
training.microBatchSize *
training.seqLength *
training.gradAccumSteps *
derivedParallelism.dp *
attentionComputeMultiplier *
activationCheckpointComputeMultiplier
const launchedGPUs = Math.max(totalGPUs, 1)
const flopsPerGpuPerStep = totalFlopsPerStep / launchedGPUs
const peakTFLOPs = getPeakTFLOPsForPrecision(cluster.gpuType, training.precision)
const sustainedTFLOPs = peakTFLOPs * getSustainedComputeEfficiency(training)
const computeTimePerStepMs = (flopsPerGpuPerStep / (sustainedTFLOPs * 1e12)) * 1000
const pipelineBubbleTimeMs =
pipelineBubbleFraction >= 1
? 0
: (computeTimePerStepMs * pipelineBubbleFraction) / (1 - pipelineBubbleFraction)
const tentativeTotalStepTimeMs = computeTimePerStepMs + pipelineBubbleTimeMs
const tpMembers = placement.filter(
(entry) =>
entry.dpReplica === 0 &&
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.epLane === 0 &&
entry.tpLane >= 0,
)
const tpStats = getRingCommStats({
groupCount: parallelism.pp * parallelism.cp * parallelism.ep * derivedParallelism.dp,
groupWidth: parallelism.tp,
messageBytes: collectiveMessageBytes,
collectiveCount: TP_ALL_REDUCES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
membersForBandwidth: tpMembers,
cluster,
totalStepTimeMs: tentativeTotalStepTimeMs,
})
const cpMembers = placement.filter(
(entry) =>
entry.dpReplica === 0 &&
entry.ppStage === 0 &&
entry.epLane === 0 &&
entry.tpLane === 0 &&
entry.cpShard >= 0,
)
const cpStats = getRingCommStats({
groupCount: parallelism.pp * derivedParallelism.dp * parallelism.tp * parallelism.ep,
groupWidth: parallelism.cp,
messageBytes: collectiveMessageBytes,
collectiveCount: CP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
membersForBandwidth: cpMembers,
cluster,
totalStepTimeMs: tentativeTotalStepTimeMs,
})
const averageSharedLayerParams =
model.numLayers > 0
? (modelBreakdown.denseLayerCount * modelBreakdown.sharedDenseLayerParams +
modelBreakdown.moeLayerCount * modelBreakdown.sharedMoeLayerParams) /
model.numLayers
: 0
const fsdpMessageBytes =
parallelism.zeroStage >= 3 && derivedParallelism.fsdpDataParallelDegree > 1
? (averageSharedLayerParams / parallelism.tp / derivedParallelism.fsdpDataParallelDegree) *
getParameterBytes(training.precision)
: 0
const fsdpMembers = placement.filter(
(entry) =>
entry.replicaGroup === 0 &&
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.epLane === 0 &&
entry.tpLane === 0,
)
const fsdpStats = getRingCommStats({
groupCount:
derivedParallelism.replicaGroups *
parallelism.pp *
parallelism.cp *
parallelism.ep *
parallelism.tp,
groupWidth: derivedParallelism.fsdpDataParallelDegree,
messageBytes: fsdpMessageBytes,
collectiveCount: FSDP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
membersForBandwidth: fsdpMembers,
cluster,
totalStepTimeMs: tentativeTotalStepTimeMs,
})
const epMembers = placement.filter(
(entry) =>
entry.dpReplica === 0 &&
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.tpLane === 0 &&
entry.epLane >= 0,
)
const moeLayerCount = modelBreakdown.moeLayerCount
const epMessageBytes =
model.architecture === 'moe' && model.moe
? tokensPerMicroBatchShard *
model.hiddenDim *
activationBytes *
model.moe.expertsPerToken
: 0
const epTransferCount = EP_ALL_TO_ALLS_PER_LAYER * moeLayerCount * training.gradAccumSteps
const epStats = (() => {
if (parallelism.ep <= 1 || epTransferCount <= 0 || epMessageBytes <= 0) {
return {
totalVolumeBytes: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
usesInterNode: false,
}
}
const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(epMembers, cluster)
const volumeBytesPerGpu = epMessageBytes * epTransferCount * 2
const totalVolumeBytes =
volumeBytesPerGpu *
parallelism.ep *
parallelism.pp *
parallelism.cp *
parallelism.tp *
derivedParallelism.dp
const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
const linkUtilizationPercent =
tentativeTotalStepTimeMs > 0
? clamp(
(bytesToGB(volumeBytesPerGpu) /
(bandwidthGBs * (tentativeTotalStepTimeMs / 1000))) *
100,
0,
100,
)
: 0
return {
totalVolumeBytes,
timePerStepMs,
linkUtilizationPercent,
usesInterNode,
}
})()
let ppTotalVolumeBytes = 0
let ppTimePerStepMs = 0
let ppUsesInterNode = false
for (let dpReplica = 0; dpReplica < derivedParallelism.dp; dpReplica += 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let stageIndex = 0; stageIndex < parallelism.pp - 1; stageIndex += 1) {
const source = getPlacementEntry(placement, {
dpReplica,
ppStage: stageIndex,
cpShard,
epLane: 0,
tpLane: 0,
})
const target = getPlacementEntry(placement, {
dpReplica,
ppStage: stageIndex + 1,
cpShard,
epLane: 0,
tpLane: 0,
})
if (!source || !target) {
continue
}
const usesInterNode = source.nodeIndex !== target.nodeIndex
const bandwidthGBs = usesInterNode
? cluster.interNodeBandwidthGBs
: cluster.intraNodeBandwidthGBs
const perLaneBytes = collectiveMessageBytes / parallelism.tp
ppUsesInterNode ||= usesInterNode
ppTotalVolumeBytes += collectiveMessageBytes * 2 * training.gradAccumSteps
ppTimePerStepMs +=
(bytesToGB(perLaneBytes) / bandwidthGBs) * 1000 * 2 * training.gradAccumSteps
}
}
}
const maxStageGradientBytes = Math.max(
...Array.from(stageMemory.values()).map((stage) => stage.gradientsGB * BYTES_PER_GB),
0,
)
const dpGroupWidth =
parallelism.fsdpShardGroupSize > 1
? derivedParallelism.replicaGroups
: derivedParallelism.dp
const dpMembers = parallelism.fsdpShardGroupSize > 1
? placement.filter(
(entry) =>
entry.fsdpRank === 0 &&
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.epLane === 0 &&
entry.tpLane === 0,
)
: placement.filter(
(entry) =>
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.epLane === 0 &&
entry.tpLane === 0,
)
const gradientCommBytesPerGpu =
dpGroupWidth > 1
? (2 * (dpGroupWidth - 1) * maxStageGradientBytes) / dpGroupWidth
: 0
const dpBandwidth = getMaxBandwidthForCollective(dpMembers, cluster)
const dpTimeMs =
dpGroupWidth > 1
? (bytesToGB(gradientCommBytesPerGpu) / dpBandwidth.bandwidthGBs) * 1000
: 0
const canOverlapDp = dpGroupWidth > 1 && (parallelism.pp > 1 || training.gradAccumSteps > 1)
const dpNonOverlappedTimeMs = dpTimeMs * (canOverlapDp ? 0.35 : 1)
const communicationTimePerStepMs =
tpStats.timePerStepMs +
cpStats.timePerStepMs +
fsdpStats.timePerStepMs +
epStats.timePerStepMs +
ppTimePerStepMs +
dpNonOverlappedTimeMs
const totalStepTimeMs =
computeTimePerStepMs + pipelineBubbleTimeMs + communicationTimePerStepMs
const tokensPerSecond =
totalStepTimeMs > 0 ? globalBatchSizeTokens / (totalStepTimeMs / 1000) : 0
const mfu =
tokensPerSecond > 0
? clamp(
(6 * modelBreakdown.activeParamsPerToken * attentionComputeMultiplier * tokensPerSecond) /
(launchedGPUs * peakTFLOPs * 1e12),
0,
1,
)
: 0
const dpLinkUtilizationPercent =
dpGroupWidth > 1 && totalStepTimeMs > 0
? clamp(
(bytesToGB(gradientCommBytesPerGpu) /
(dpBandwidth.bandwidthGBs * (totalStepTimeMs / 1000))) *
100,
0,
100,
)
: 0
const ppPerLaneVolumeGB =
parallelism.pp > 1
? bytesToGB(collectiveMessageBytes / parallelism.tp) * 2 * training.gradAccumSteps
: 0
const ppLinkUtilizationPercent =
parallelism.pp > 1 && totalStepTimeMs > 0
? clamp(
(ppPerLaneVolumeGB /
((ppUsesInterNode
? cluster.interNodeBandwidthGBs
: cluster.intraNodeBandwidthGBs) *
(totalStepTimeMs / 1000))) *
100,
0,
100,
)
: 0
const links: ClusterAnalysis['links'] = []
const visualReplicaSamples = Math.min(derivedParallelism.dp, 12)
const sampledDpReplicas = Array.from({ length: visualReplicaSamples }, (_, sampleIndex) =>
Math.floor((sampleIndex * derivedParallelism.dp) / visualReplicaSamples),
)
for (const dpReplica of sampledDpReplicas) {
for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
const tpEntries = placement
.filter(
(entry) =>
entry.dpReplica === dpReplica &&
entry.ppStage === ppStage &&
entry.cpShard === cpShard &&
entry.epLane === epLane,
)
.sort((left, right) => left.tpLane - right.tpLane)
if (parallelism.tp > 1) {
for (let lane = 0; lane < tpEntries.length; lane += 1) {
const from = tpEntries[lane]
const to = tpEntries[(lane + 1) % tpEntries.length]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: 'nvlink',
trafficType: 'tp',
volumeGB: round2(bytesToGB(tpStats.volumeBytesPerGpu)),
utilizationPercent: round2(tpStats.linkUtilizationPercent),
})
}
}
if (ppStage < parallelism.pp - 1) {
const nextTpEntries = placement
.filter(
(entry) =>
entry.dpReplica === dpReplica &&
entry.ppStage === ppStage + 1 &&
entry.cpShard === cpShard &&
entry.epLane === epLane,
)
.sort((left, right) => left.tpLane - right.tpLane)
for (let lane = 0; lane < Math.min(tpEntries.length, nextTpEntries.length); lane += 1) {
const from = tpEntries[lane]
const to = nextTpEntries[lane]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'pp',
volumeGB: round2(ppPerLaneVolumeGB),
utilizationPercent: round2(ppLinkUtilizationPercent),
})
}
}
}
}
if (parallelism.cp > 1) {
for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
const cpEntries = placement
.filter(
(entry) =>
entry.dpReplica === dpReplica &&
entry.ppStage === ppStage &&
entry.epLane === epLane &&
entry.tpLane === tpLane,
)
.sort((left, right) => left.cpShard - right.cpShard)
for (let shardIndex = 0; shardIndex < cpEntries.length; shardIndex += 1) {
const from = cpEntries[shardIndex]
const to = cpEntries[(shardIndex + 1) % cpEntries.length]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'cp',
volumeGB: round2(bytesToGB(cpStats.volumeBytesPerGpu)),
utilizationPercent: round2(cpStats.linkUtilizationPercent),
})
}
}
}
}
if (parallelism.ep > 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
const epEntries = placement
.filter(
(entry) =>
entry.dpReplica === dpReplica &&
entry.ppStage === ppStage &&
entry.cpShard === cpShard &&
entry.tpLane === tpLane,
)
.sort((left, right) => left.epLane - right.epLane)
for (let lane = 0; lane < epEntries.length; lane += 1) {
const from = epEntries[lane]
const to = epEntries[(lane + 1) % epEntries.length]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'ep',
volumeGB: round2(
epStats.totalVolumeBytes > 0
? bytesToGB(epStats.totalVolumeBytes) /
(parallelism.ep *
Math.max(parallelism.tp * parallelism.cp * parallelism.pp * derivedParallelism.dp, 1))
: 0,
),
utilizationPercent: round2(epStats.linkUtilizationPercent),
})
}
}
}
}
if (derivedParallelism.fsdpDataParallelDegree > 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
const fsdpEntries = placement
.filter(
(entry) =>
entry.replicaGroup === placement.find((item) => item.dpReplica === dpReplica)?.replicaGroup &&
entry.ppStage === ppStage &&
entry.cpShard === cpShard &&
entry.epLane === epLane &&
entry.tpLane === tpLane,
)
.sort((left, right) => left.fsdpRank - right.fsdpRank)
for (let rank = 0; rank < fsdpEntries.length; rank += 1) {
const from = fsdpEntries[rank]
const to = fsdpEntries[(rank + 1) % fsdpEntries.length]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'fsdp',
volumeGB: round2(bytesToGB(fsdpStats.volumeBytesPerGpu)),
utilizationPercent: round2(fsdpStats.linkUtilizationPercent),
})
}
}
}
}
}
if (dpGroupWidth > 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
const current = placement.find((entry) => entry.dpReplica === dpReplica)
if (!current) {
continue
}
const from = getPlacementEntry(placement, {
replicaGroup:
parallelism.fsdpShardGroupSize > 1 ? current.replicaGroup : undefined,
fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined,
dpReplica: parallelism.fsdpShardGroupSize > 1 ? undefined : dpReplica,
ppStage,
cpShard,
epLane,
tpLane,
})
const to = getPlacementEntry(placement, {
replicaGroup:
parallelism.fsdpShardGroupSize > 1
? (current.replicaGroup + 1) % derivedParallelism.replicaGroups
: undefined,
fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined,
dpReplica:
parallelism.fsdpShardGroupSize > 1
? undefined
: (dpReplica + 1) % derivedParallelism.dp,
ppStage,
cpShard,
epLane,
tpLane,
})
if (!from || !to) {
continue
}
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'dp',
volumeGB: round2(bytesToGB(gradientCommBytesPerGpu)),
utilizationPercent: round2(dpLinkUtilizationPercent),
})
}
}
}
}
}
}
const feasible = worstStageMemory.totalGB <= cluster.gpuType.hbmCapacityGB
const infeasibilityReason = feasible
? undefined
: `Stage ${worstStageIndex} uses ${round2(worstStageMemory.totalGB)} GB per GPU, exceeding ${cluster.gpuType.hbmCapacityGB} GB of HBM.`
return {
feasible,
infeasibilityReason,
totalParams: Math.round(modelBreakdown.totalParams),
activeParamsPerToken: Math.round(modelBreakdown.activeParamsPerToken),
globalBatchSizeTokens,
totalGPUs,
derivedParallelism: {
dp: derivedParallelism.dp,
replicaGroups: derivedParallelism.replicaGroups,
fsdpShardGroupSize: parallelism.fsdpShardGroupSize,
fsdpGroupSize: derivedParallelism.fsdpGroupSize,
ep: parallelism.ep,
},
memoryBreakdown: {
parametersGB: round2(worstStageMemory.parametersGB),
optimizerStatesGB: round2(worstStageMemory.optimizerStatesGB),
gradientsGB: round2(worstStageMemory.gradientsGB),
activationsGB: round2(worstStageMemory.activationsGB),
totalGB: round2(worstStageMemory.totalGB),
hbmCapacityGB: cluster.gpuType.hbmCapacityGB,
utilizationPercent: round2(
(worstStageMemory.totalGB / cluster.gpuType.hbmCapacityGB) * 100,
),
},
pipelineStages,
communication: {
tp: {
allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER,
messageSizeBytes: collectiveMessageBytes,
totalVolumePerStepGB: round2(bytesToGB(tpStats.totalVolumeBytes)),
timePerStepMs: round2(tpStats.timePerStepMs),
linkUtilizationPercent: round2(tpStats.linkUtilizationPercent),
},
pp: {
activationMessageSizeBytes: collectiveMessageBytes,
numP2PTransfersPerStep:
parallelism.pp > 1
? 2 *
(parallelism.pp - 1) *
training.gradAccumSteps *
parallelism.cp *
parallelism.tp *
derivedParallelism.dp
: 0,
totalVolumePerStepGB: round2(bytesToGB(ppTotalVolumeBytes)),
timePerStepMs: round2(ppTimePerStepMs),
usesInterNode: ppUsesInterNode,
},
cp: {
collectivesPerLayer: CP_COLLECTIVES_PER_LAYER,
messageSizeBytes: collectiveMessageBytes,
totalVolumePerStepGB: round2(bytesToGB(cpStats.totalVolumeBytes)),
timePerStepMs: round2(cpStats.timePerStepMs),
linkUtilizationPercent: round2(cpStats.linkUtilizationPercent),
usesInterNode: cpStats.usesInterNode,
},
fsdp: {
collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER,
messageSizeBytes: round2(fsdpMessageBytes),
totalVolumePerStepGB: round2(bytesToGB(fsdpStats.totalVolumeBytes)),
timePerStepMs: round2(fsdpStats.timePerStepMs),
linkUtilizationPercent: round2(fsdpStats.linkUtilizationPercent),
usesInterNode: fsdpStats.usesInterNode,
},
ep: {
allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER,
messageSizeBytes: round2(epMessageBytes),
totalVolumePerStepGB: round2(bytesToGB(epStats.totalVolumeBytes)),
timePerStepMs: round2(epStats.timePerStepMs),
linkUtilizationPercent: round2(epStats.linkUtilizationPercent),
usesInterNode: epStats.usesInterNode,
},
dp: {
gradientVolumePerGPU_GB: round2(bytesToGB(gradientCommBytesPerGpu)),
allReduceTimeMs: round2(dpTimeMs),
canOverlapWithBackward: canOverlapDp,
linkUtilizationPercent: round2(dpLinkUtilizationPercent),
},
},
throughput: {
computeTimePerStepMs: round2(computeTimePerStepMs),
communicationTimePerStepMs: round2(communicationTimePerStepMs),
pipelineBubbleFraction: round2(pipelineBubbleFraction),
pipelineBubbleTimeMs: round2(pipelineBubbleTimeMs),
totalStepTimeMs: round2(totalStepTimeMs),
tokensPerSecond: round2(tokensPerSecond),
mfu: round2(mfu),
},
gpuMap,
links,
}
}
/**
 * Preset: LLaMA-7B-class dense transformer.
 *
 * Multi-head attention with equal query and KV head counts (no GQA) and
 * full (non-windowed) attention on every layer.
 */
export const llama7B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 4096,
    numLayers: 32,
    numHeads: 32,
    numKVHeads: 32,
    vocabSize: 32000,
    intermediateSize: 11008,
    tiedEmbeddings: false,
    attentionProfile: { type: 'full' },
  }
  return config
}
/**
 * Preset: LLaMA-70B-class dense transformer.
 *
 * 64 query heads over 8 KV heads (grouped KV) with full attention on
 * every layer.
 */
export const llama70B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 8192,
    numLayers: 80,
    numHeads: 64,
    numKVHeads: 8,
    vocabSize: 32000,
    intermediateSize: 28672,
    tiedEmbeddings: false,
    attentionProfile: { type: 'full' },
  }
  return config
}
export const llama405B = (): ModelConfig => ({
architecture: 'dense',
hiddenDim: 16384,
numLayers: 126,
numHeads: 128,
numKVHeads: 8,
vocabSize: 128256,
intermediateSize: 53248,
tiedEmbeddings: false,
attentionProfile: {
type: 'full',
},
})
/**
 * Preset: OLMo-3 32B dense transformer.
 *
 * Uses a hybrid attention profile: a 4096-token sliding window with a
 * globalAttentionFraction of 0.25 (how that fraction maps onto layers is
 * decided by the attention-profile consumer elsewhere in this module).
 */
export const olmo3_32B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 5120,
    numLayers: 64,
    numHeads: 40,
    numKVHeads: 8,
    vocabSize: 100278,
    intermediateSize: 27648,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'hybrid',
      slidingWindowSize: 4096,
      globalAttentionFraction: 0.25,
    },
  }
  return config
}
/**
 * Preset: Llama-3.1-405B-class dense transformer.
 *
 * 128 query heads over 8 KV heads (grouped KV), large 128k-entry vocab,
 * full attention on every layer.
 */
export const llama31_405B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 16384,
    numLayers: 126,
    numHeads: 128,
    numKVHeads: 8,
    vocabSize: 128256,
    intermediateSize: 53248,
    tiedEmbeddings: false,
    attentionProfile: { type: 'full' },
  }
  return config
}
/**
 * Preset: "Trinity Large" 400B-class mixture-of-experts transformer.
 *
 * 256 experts with 4 routed per token, the first 6 layers dense, and a
 * hybrid attention profile (4096-token sliding window, full attention on
 * every 4th layer). activeParamsPerToken is pinned explicitly to 13B
 * rather than derived from the other fields.
 */
export const trinityLarge400B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'moe',
    hiddenDim: 3072,
    numLayers: 60,
    numHeads: 48,
    numKVHeads: 8,
    vocabSize: 200192,
    intermediateSize: 12288,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'hybrid',
      slidingWindowSize: 4096,
      globalAttentionEveryN: 4,
    },
    moe: {
      numExperts: 256,
      expertsPerToken: 4,
      numDenseLayers: 6,
      expertIntermediateSize: 3072,
      activeParamsPerToken: 13_000_000_000,
    },
  }
  return config
}
/** GPU spec: NVIDIA A100 80GB (312 TFLOPs BF16, 2 TB/s HBM). */
export const a100_80gb = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'A100 80GB',
    hbmCapacityGB: 80,
    peakTFLOPsBF16: 312,
    memBandwidthTBs: 2,
  }
  return spec
}
/** GPU spec: NVIDIA H100 SXM (989 TFLOPs BF16, 3.35 TB/s HBM3). */
export const h100_sxm = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'H100 SXM',
    hbmCapacityGB: 80,
    peakTFLOPsBF16: 989,
    memBandwidthTBs: 3.35,
  }
  return spec
}
/**
 * GPU spec: NVIDIA B300.
 *
 * NOTE(review): public B300 (Blackwell Ultra) material quotes 288 GB of
 * HBM3e; the 192 GB here may be a deliberate simplification to match
 * gb200 — confirm before relying on capacity-bound results.
 */
export const b300 = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'B300',
    hbmCapacityGB: 192,
    peakTFLOPsBF16: 2250,
    memBandwidthTBs: 8,
  }
  return spec
}
/** GPU spec: NVIDIA GB200 (per-GPU figures; 2250 TFLOPs BF16, 8 TB/s HBM3e). */
export const gb200 = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'GB200',
    hbmCapacityGB: 192,
    peakTFLOPsBF16: 2250,
    memBandwidthTBs: 8,
  }
  return spec
}
/**
 * Cluster preset: a single 8-GPU host (defaults to A100 80GB).
 *
 * Fabric bandwidths come from getDefaultFabric for the chosen GPU type.
 * With only one node, all display labels collapse to node-level scope.
 */
export const singleNode8GPU = (gpuType: GPUSpec = a100_80gb()): ClusterConfig => {
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)
  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 1,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 1,
    rackLabel: 'node',
    nodeLabel: 'GPU host',
    podLabel: 'node',
  }
}
/**
 * Cluster preset: 64 GPUs as 8 nodes x 8 GPUs, 4 nodes per rack
 * (defaults to H100 SXM).
 *
 * Fabric bandwidths come from getDefaultFabric for the chosen GPU type.
 */
export const cluster64GPU = (gpuType: GPUSpec = h100_sxm()): ClusterConfig => {
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)
  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 8,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 4,
    rackLabel: 'rack',
    nodeLabel: 'GPU host',
    podLabel: 'rack',
  }
}
/**
 * Cluster preset: 576 GB200 GPUs laid out as 72 compute trays of 8 GPUs,
 * 9 trays per NVL72-style rack. GPU type is fixed to gb200; fabric
 * bandwidths come from getDefaultFabric.
 */
export const frontier576GPU = (): ClusterConfig => {
  const gpuType = gb200()
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)
  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 72,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 9,
    rackLabel: 'NVL72 rack',
    nodeLabel: 'compute tray',
    podLabel: 'rack',
  }
}