/**
 * Model architecture description consumed by the cluster analyzer.
 * All counts here are raw (unsharded) sizes.
 */
export interface ModelConfig {
  architecture: 'dense' | 'moe'
  hiddenDim: number
  numLayers: number
  numHeads: number
  // Grouped-query attention: number of KV heads (equals numHeads for plain MHA).
  numKVHeads: number
  vocabSize: number
  intermediateSize: number
  // When true, the output head reuses the embedding matrix (no extra params).
  tiedEmbeddings: boolean
  // Optional hybrid attention profile (sliding-window + periodic global layers).
  attentionProfile?: {
    type: 'full' | 'hybrid'
    slidingWindowSize?: number
    globalAttentionFraction?: number
    globalAttentionEveryN?: number
  }
  // Present only when architecture === 'moe'.
  moe?: {
    numExperts: number
    expertsPerToken: number
    // Leading layers that stay dense before MoE layers begin.
    numDenseLayers: number
    expertIntermediateSize: number
    // Optional override; otherwise active params are derived from the config.
    activeParamsPerToken?: number
  }
}

/** Per-step training hyperparameters that affect memory and throughput. */
export interface TrainingConfig {
  microBatchSize: number
  seqLength: number
  gradAccumSteps: number
  precision: 'fp32' | 'bf16' | 'fp16' | 'fp8'
  activationCheckpointing: boolean
  optimizer: 'adam' | 'adamw' | 'sgd' | 'muon'
}

/** Static hardware spec for one GPU SKU. */
export interface GPUSpec {
  name: string
  hbmCapacityGB: number
  peakTFLOPsBF16: number
  memBandwidthTBs: number
}

/** Physical cluster topology and link bandwidths (GB/s). */
export interface ClusterConfig {
  gpuType: GPUSpec
  gpusPerNode: number
  numNodes: number
  intraNodeBandwidthGBs: number
  interNodeBandwidthGBs: number
  nodesPerRack?: number
  rackLabel?: string
  nodeLabel?: string
  podLabel?: string
}

/** User-selected parallelism degrees (TP/PP/CP/EP) and sharding options. */
export interface ParallelismConfig {
  tp: number
  pp: number
  cp: number
  ep: number
  distributedOptimizer: boolean
  fsdpShardGroupSize: number
  zeroStage: 0 | 1 | 2 | 3
}

/**
 * Full analyzer output: feasibility, memory, per-family communication costs,
 * throughput estimates, and a per-GPU placement map for visualization.
 */
export interface ClusterAnalysis {
  feasible: boolean
  infeasibilityReason?: string
  totalParams: number
  activeParamsPerToken: number
  globalBatchSizeTokens: number
  totalGPUs: number
  derivedParallelism: {
    dp: number
    replicaGroups: number
    fsdpShardGroupSize: number
    fsdpGroupSize: number
    ep: number
  }
  // Per-GPU memory picture for the worst (most loaded) pipeline stage.
  memoryBreakdown: {
    parametersGB: number
    optimizerStatesGB: number
    gradientsGB: number
    activationsGB: number
    totalGB: number
    hbmCapacityGB: number
    utilizationPercent: number
  }
  pipelineStages: {
    stageIndex: number
    layerRange: [number, number]
    numLayers: number
    memoryGB: number
    hasEmbedding: boolean
    hasOutputHead: boolean
  }[]
  // One entry per parallelism family; volumes are per training step.
  communication: {
    tp: {
      allReducesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
    }
    pp: {
      activationMessageSizeBytes: number
      numP2PTransfersPerStep: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      usesInterNode: boolean
    }
    cp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    fsdp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    ep: {
      allToAllsPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    dp: {
      gradientVolumePerGPU_GB: number
      allReduceTimeMs: number
      canOverlapWithBackward: boolean
      linkUtilizationPercent: number
    }
  }
  throughput: {
    computeTimePerStepMs: number
    communicationTimePerStepMs: number
    pipelineBubbleFraction: number
    pipelineBubbleTimeMs: number
    totalStepTimeMs: number
    tokensPerSecond: number
    mfu: number
  }
  // One entry per physical GPU; -1 coordinates mean "unassigned/idle".
  gpuMap: {
    globalGPUIndex: number
    nodeIndex: number
    localGPUIndex: number
    tpGroup: number
    tpLane: number
    ppStage: number
    cpShard: number
    epLane: number
    dpReplica: number
    replicaGroup: number
    fsdpRank: number
    memoryUsedGB: number
    memoryCapacityGB: number
    isActive: boolean
  }[]
  links: {
    fromGPU: number
    toGPU: number
    type: 'nvlink' | 'infiniband'
    trafficType: 'tp' | 'pp' | 'cp' | 'fsdp' | 'ep' | 'dp'
    volumeGB: number
    utilizationPercent: number
  }[]
}

/** Contiguous layer span assigned to one pipeline stage. */
type LayerDistribution = {
  stageIndex: number
  startLayer: number
  endLayer: number
  numLayers: number
}

/** Per-GPU memory components for one stage, in decimal GB. */
type StageMemory = {
  parametersGB: number
  optimizerStatesGB: number
  gradientsGB: number
  activationsGB: number
  totalGB: number
}

/** Unsharded parameter counts owned by one pipeline stage. */
type StageParameterCount = {
  stageParams: number
  sharedParams: number
  expertParams: number
  denseLayers: number
  moeLayers: number
  hasEmbedding: boolean
  hasOutputHead: boolean
}

/** Coordinates of one physical GPU across every parallelism axis. */
type PlacementEntry = {
  globalGPUIndex: number
  nodeIndex: number
  localGPUIndex: number
  tpGroup: number
  tpLane: number
  ppStage: number
  cpShard: number
  epLane: number
  dpReplica:
number replicaGroup: number fsdpRank: number isActive: boolean } type DerivedParallelism = { modelParallelSize: number dp: number replicaGroups: number fsdpGroupSize: number fsdpDataParallelDegree: number } type ModelBreakdown = ReturnType type RingCommStats = { volumeBytesPerGpu: number totalVolumeBytes: number timePerStepMs: number linkUtilizationPercent: number usesInterNode: boolean } const BYTES_PER_GB = 1e9 const TP_ALL_REDUCES_PER_LAYER = 4 const CP_COLLECTIVES_PER_LAYER = 2 const FSDP_COLLECTIVES_PER_LAYER = 4 const EP_ALL_TO_ALLS_PER_LAYER = 2 const DEFAULT_BF16_EFFICIENCY = 0.56 const clamp = (value: number, min: number, max: number) => Math.min(Math.max(value, min), max) const bytesToGB = (bytes: number) => bytes / BYTES_PER_GB const round2 = (value: number) => Math.round(value * 100) / 100 const getParameterBytes = (precision: TrainingConfig['precision']) => { switch (precision) { case 'fp32': return 4 case 'fp8': return 1 default: return 2 } } const getActivationBytes = (precision: TrainingConfig['precision']) => precision === 'fp32' ? 4 : 2 const getGradientBytes = (precision: TrainingConfig['precision']) => precision === 'fp32' ? 4 : 2 const getOptimizerBytesPerParam = ( optimizer: TrainingConfig['optimizer'], precision: TrainingConfig['precision'], ) => { if (optimizer === 'sgd') { return 4 } // Muon keeps lower optimizer state than Adam-family optimizers in practice. // We model it as 8 bytes per parameter of extra state on top of bf16 weights. if (optimizer === 'muon') { return 8 } return precision === 'fp32' ? 8 : 12 } const getPeakTFLOPsForPrecision = (gpu: GPUSpec, precision: TrainingConfig['precision']) => { switch (precision) { case 'fp32': return gpu.peakTFLOPsBF16 * 0.25 case 'fp8': return gpu.peakTFLOPsBF16 * 2 default: return gpu.peakTFLOPsBF16 } } const getSustainedComputeEfficiency = (training: TrainingConfig) => { const checkpointPenalty = training.activationCheckpointing ? 0.02 : 0 const fp32Penalty = training.precision === 'fp32' ? 
0.08 : 0 const moeBoost = training.optimizer === 'muon' ? 0.02 : 0 return clamp(DEFAULT_BF16_EFFICIENCY - checkpointPenalty - fp32Penalty + moeBoost, 0.3, 0.62) } const distributeLayers = (numLayers: number, pp: number): LayerDistribution[] => { const baseLayers = Math.floor(numLayers / pp) const remainder = numLayers % pp let startLayer = 0 return Array.from({ length: pp }, (_, stageIndex) => { const stageLayers = baseLayers + (stageIndex < remainder ? 1 : 0) const endLayer = startLayer + stageLayers - 1 const distribution = { stageIndex, startLayer, endLayer, numLayers: stageLayers, } startLayer += stageLayers return distribution }) } const getDefaultFabric = (gpu: GPUSpec) => { const normalizedName = gpu.name.toLowerCase() if (normalizedName.includes('gb200')) { return { intraNodeBandwidthGBs: 900, interNodeBandwidthGBs: 100, } } if (normalizedName.includes('h100')) { return { intraNodeBandwidthGBs: 450, interNodeBandwidthGBs: 100, } } return { intraNodeBandwidthGBs: 300, interNodeBandwidthGBs: 50, } } const getModelBreakdown = (model: ModelConfig) => { const headDim = model.hiddenDim / model.numHeads const embeddingParams = model.vocabSize * model.hiddenDim const kvProjectionDim = model.numKVHeads * headDim const perLayerAttentionParams = model.hiddenDim * (model.hiddenDim + 2 * kvProjectionDim + model.hiddenDim) const perLayerDenseMlpParams = model.hiddenDim * model.intermediateSize * 3 const perLayerNormParams = model.hiddenDim * 2 const finalNormParams = model.hiddenDim const outputHeadParams = model.tiedEmbeddings ? 0 : embeddingParams const perExpertParams = model.architecture === 'moe' && model.moe ? model.hiddenDim * model.moe.expertIntermediateSize * 3 : 0 const totalExpertParamsPerLayer = model.architecture === 'moe' && model.moe ? perExpertParams * model.moe.numExperts : 0 const denseLayerCount = model.architecture === 'moe' && model.moe ? 
      model.moe.numDenseLayers : model.numLayers
  const moeLayerCount = model.numLayers - denseLayerCount
  const sharedDenseLayerParams = perLayerAttentionParams + perLayerDenseMlpParams + perLayerNormParams
  // MoE layers share attention + norms; their MLP params live in the experts.
  const sharedMoeLayerParams = perLayerAttentionParams + perLayerNormParams
  const sharedParams = embeddingParams + denseLayerCount * sharedDenseLayerParams + moeLayerCount * sharedMoeLayerParams + finalNormParams + outputHeadParams
  const totalParams = sharedParams + moeLayerCount * totalExpertParamsPerLayer
  // Active params per token: shared params plus only the routed top-k experts.
  const derivedActiveParams = model.architecture === 'moe' && model.moe ? embeddingParams + denseLayerCount * sharedDenseLayerParams + moeLayerCount * (sharedMoeLayerParams + model.moe.expertsPerToken * perExpertParams) + finalNormParams + outputHeadParams : totalParams
  // An explicit override in the config wins over the derived estimate.
  const activeParamsPerToken = model.architecture === 'moe' && model.moe?.activeParamsPerToken != null ? model.moe.activeParamsPerToken : derivedActiveParams
  const perLayerTotalParams = model.architecture === 'moe' ? sharedMoeLayerParams + totalExpertParamsPerLayer : sharedDenseLayerParams
  return {
    headDim,
    kvProjectionDim,
    embeddingParams,
    perLayerAttentionParams,
    perLayerDenseMlpParams,
    perLayerNormParams,
    perExpertParams,
    totalExpertParamsPerLayer,
    sharedDenseLayerParams,
    sharedMoeLayerParams,
    denseLayerCount,
    moeLayerCount,
    sharedParams,
    perLayerTotalParams,
    finalNormParams,
    outputHeadParams,
    totalParams,
    activeParamsPerToken,
  }
}

/** Micro-batches in flight at once: bounded by grad-accum steps and PP depth. */
const getConcurrentMicroBatches = (
  training: TrainingConfig,
  parallelism: ParallelismConfig,
) => {
  if (parallelism.pp <= 1) {
    return 1
  }
  return Math.max(1, Math.min(training.gradAccumSteps, parallelism.pp))
}

/**
 * Effective attention cost multiplier in (0, 1] for hybrid attention: blends
 * the sliding-window fraction with the full-attention (global) fraction.
 */
const getAttentionMultiplier = (model: ModelConfig, seqLength: number) => {
  const profile = model.attentionProfile
  if (!profile || profile.type === 'full') {
    return 1
  }
  const windowMultiplier = profile.slidingWindowSize != null ? clamp(profile.slidingWindowSize / seqLength, 0, 1) : 1
  // Default global fraction: every-Nth-layer if given, else assume one quarter.
  const globalFraction = profile.globalAttentionFraction ??
    (profile.globalAttentionEveryN != null ? 1 / profile.globalAttentionEveryN : 0.25)
  return clamp(globalFraction + (1 - globalFraction) * windowMultiplier, windowMultiplier, 1)
}

/** Dense vs MoE layer counts inside a stage (dense layers come first). */
const getStageLayerMix = (stage: LayerDistribution, model: ModelConfig) => {
  if (model.architecture !== 'moe' || !model.moe) {
    return {
      denseLayers: stage.numLayers,
      moeLayers: 0,
    }
  }
  const denseEnd = model.moe.numDenseLayers - 1
  const denseLayers = denseEnd < stage.startLayer ? 0 : Math.max(0, Math.min(stage.endLayer, denseEnd) - stage.startLayer + 1)
  return {
    denseLayers,
    moeLayers: stage.numLayers - denseLayers,
  }
}

/** Unsharded parameter counts owned by one pipeline stage. */
const getStageParameterCount = (
  stage: LayerDistribution,
  modelBreakdown: ModelBreakdown,
  parallelism: ParallelismConfig,
  model: ModelConfig,
): StageParameterCount => {
  const layerMix = getStageLayerMix(stage, model)
  let sharedParams = layerMix.denseLayers * modelBreakdown.sharedDenseLayerParams + layerMix.moeLayers * modelBreakdown.sharedMoeLayerParams
  const expertParams = layerMix.moeLayers * modelBreakdown.totalExpertParamsPerLayer
  // First stage owns the embedding; last stage owns final norm + output head.
  const hasEmbedding = stage.stageIndex === 0
  const hasOutputHead = stage.stageIndex === parallelism.pp - 1
  if (hasEmbedding) {
    sharedParams += modelBreakdown.embeddingParams
  }
  if (hasOutputHead) {
    sharedParams += modelBreakdown.finalNormParams + modelBreakdown.outputHeadParams
  }
  return {
    stageParams: sharedParams + expertParams,
    sharedParams,
    expertParams,
    denseLayers: layerMix.denseLayers,
    moeLayers: layerMix.moeLayers,
    hasEmbedding,
    hasOutputHead,
  }
}

/** Per-layer activation bytes on one GPU after TP/CP/EP sharding. */
const getActivationMemoryBytesPerLayer = ({
  model,
  training,
  parallelism,
  isMoeLayer,
}: {
  model: ModelConfig
  training: TrainingConfig
  parallelism: ParallelismConfig
  isMoeLayer: boolean
}) => {
  const activationBytes = getActivationBytes(training.precision)
  const shardedSequenceLength = training.seqLength / parallelism.cp
  const tokensPerShard = training.microBatchSize * shardedSequenceLength
  const kvHiddenDim = model.numKVHeads * (model.hiddenDim / model.numHeads)
  const tpSequenceShardFactor =
    parallelism.tp > 1 ? parallelism.tp : 1
  // Sequence parallelism shards the residual stream and checkpointed layer boundaries across
  // the TP group. We assume TP-enabled dense training uses this Megatron-style optimization.
  const hiddenStateBytes = (tokensPerShard * model.hiddenDim * activationBytes) / tpSequenceShardFactor
  const attentionMultiplier = getAttentionMultiplier(model, training.seqLength)
  // Sequence-parallel CP reduces the activation footprint by the number of sequence shards.
  const qkvBytes = tokensPerShard * (model.hiddenDim + 2 * kvHiddenDim) * activationBytes * attentionMultiplier
  const denseMlpBytes = tokensPerShard * model.intermediateSize * activationBytes * 2
  const moeMlpBytes = isMoeLayer && model.moe ? (tokensPerShard * model.moe.expertIntermediateSize * activationBytes * model.moe.expertsPerToken * 2) / Math.max(parallelism.ep, 1) : 0
  const shardedIntermediateBytes = (qkvBytes + (isMoeLayer ? moeMlpBytes : denseMlpBytes)) / Math.max(parallelism.tp, 1)
  if (training.activationCheckpointing) {
    // Checkpointing keeps only layer-boundary states plus a small working set.
    return hiddenStateBytes * 2 + shardedIntermediateBytes * 0.25
  }
  return hiddenStateBytes * 6 + shardedIntermediateBytes * 2
}

/**
 * Per-GPU memory footprint (params / optimizer / grads / activations) for one
 * pipeline stage, applying TP/EP sharding and ZeRO/FSDP shard factors.
 */
const getStageMemory = (
  stageParams: StageParameterCount,
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const parameterBytes = getParameterBytes(training.precision)
  const gradientBytes = getGradientBytes(training.precision)
  const optimizerBytes = getOptimizerBytesPerParam(training.optimizer, training.precision)
  const fsdpShardFactor = parallelism.fsdpShardGroupSize > 1 ? derivedParallelism.fsdpDataParallelDegree : 1
  const distributedShardFactor = parallelism.distributedOptimizer ? derivedParallelism.dp : 1
  // ZeRO-3 shards parameters; ZeRO-1+ shards optimizer; ZeRO-2+ shards grads.
  // With an explicit FSDP shard group, that group's degree is used instead of
  // the full DP degree.
  const parameterShardFactor = parallelism.zeroStage >= 3 ? fsdpShardFactor : 1
  const optimizerShardFactor = parallelism.zeroStage >= 1 ? parallelism.fsdpShardGroupSize > 1 ?
      fsdpShardFactor : distributedShardFactor : 1
  const gradientShardFactor = parallelism.zeroStage >= 2 ? parallelism.fsdpShardGroupSize > 1 ? fsdpShardFactor : derivedParallelism.dp : 1
  // TP shards shared params; experts are additionally split across EP lanes.
  const sharedParamsLocal = stageParams.sharedParams / Math.max(parallelism.tp, 1)
  const expertParamsLocal = stageParams.expertParams / Math.max(parallelism.tp * parallelism.ep, 1)
  const parameterMemoryBytes = (sharedParamsLocal / parameterShardFactor + expertParamsLocal / parameterShardFactor) * parameterBytes
  const optimizerMemoryBytes = (sharedParamsLocal / optimizerShardFactor + expertParamsLocal / optimizerShardFactor) * optimizerBytes
  const gradientMemoryBytes = (sharedParamsLocal / gradientShardFactor + expertParamsLocal / gradientShardFactor) * gradientBytes
  const denseLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: false,
  })
  const moeLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: true,
  })
  const concurrentMicroBatches = getConcurrentMicroBatches(training, parallelism)
  let activationMemoryBytes = (denseLayerActivationBytes * stageParams.denseLayers + moeLayerActivationBytes * stageParams.moeLayers) * concurrentMicroBatches
  if (training.activationCheckpointing && stageParams.stageParams > 0) {
    // Recompute scratch for the largest single layer while checkpointing.
    activationMemoryBytes += Math.max(denseLayerActivationBytes, moeLayerActivationBytes) * 1.5
  }
  const totalBytes = parameterMemoryBytes + optimizerMemoryBytes + gradientMemoryBytes + activationMemoryBytes
  return {
    parametersGB: bytesToGB(parameterMemoryBytes),
    optimizerStatesGB: bytesToGB(optimizerMemoryBytes),
    gradientsGB: bytesToGB(gradientMemoryBytes),
    activationsGB: bytesToGB(activationMemoryBytes),
    totalGB: bytesToGB(totalBytes),
  }
}

/** Builds the layer distribution plus per-stage parameter and memory maps. */
const getStageMemoryMap = (
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const modelBreakdown = getModelBreakdown(model)
  const layerDistribution =
    distributeLayers(model.numLayers, parallelism.pp)
  // Keyed by stage index. NOTE(review): bare `new Map()` infers Map<any, any>;
  // explicit Map<number, StageMemory> type args would tighten this.
  const stageMemory = new Map()
  const stageParameters = new Map()
  for (const stage of layerDistribution) {
    const stageParameterCount = getStageParameterCount(stage, modelBreakdown, parallelism, model)
    stageParameters.set(stage.stageIndex, stageParameterCount)
    stageMemory.set(
      stage.stageIndex,
      getStageMemory(stageParameterCount, model, training, parallelism, derivedParallelism),
    )
  }
  return {
    modelBreakdown,
    layerDistribution,
    stageMemory,
    stageParameters,
  }
}

/**
 * Lays every physical GPU out against the parallelism axes.
 * Nesting order: replicaGroup → fsdpRank → ppStage → cpShard → epLane → tpLane,
 * and a TP×EP group is never split across a node boundary.
 */
const buildPlacement = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
  requiredGPUs: number,
) => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const placement: PlacementEntry[] = []
  let nodeIndex = 0
  let localGPUIndex = 0
  let globalGPUIndex = 0
  for (let replicaGroup = 0; replicaGroup < derivedParallelism.replicaGroups; replicaGroup += 1) {
    for (let fsdpRank = 0; fsdpRank < derivedParallelism.fsdpDataParallelDegree; fsdpRank += 1) {
      const dpReplica = replicaGroup * derivedParallelism.fsdpDataParallelDegree + fsdpRank
      for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) {
        for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
          // Advance to the next node if the whole TP×EP group no longer fits.
          if (localGPUIndex + parallelism.ep * parallelism.tp > cluster.gpusPerNode) {
            nodeIndex += 1
            localGPUIndex = 0
          }
          for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
            for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
              placement.push({
                globalGPUIndex,
                nodeIndex,
                localGPUIndex,
                // Unique TP-group id across all (dp, pp, cp, ep) coordinates.
                tpGroup: (((dpReplica * parallelism.pp + ppStage) * parallelism.cp + cpShard) * parallelism.ep) + epLane,
                tpLane,
                ppStage,
                cpShard,
                epLane,
                dpReplica,
                replicaGroup,
                fsdpRank,
                isActive: globalGPUIndex < requiredGPUs,
              })
              globalGPUIndex += 1
              localGPUIndex += 1
            }
          }
        }
      }
    }
  }
  // Pad the remaining physical GPUs as unassigned (-1) idle entries.
  while (placement.length < totalGPUs) {
    if (localGPUIndex >= cluster.gpusPerNode) {
      nodeIndex += 1
      localGPUIndex = 0
    }
    placement.push({
      globalGPUIndex,
      nodeIndex,
      localGPUIndex,
      tpGroup: -1,
      tpLane: -1,
      ppStage: -1,
      cpShard: -1,
      epLane: -1,
      dpReplica: -1,
      replicaGroup: -1,
      fsdpRank: -1,
      isActive: false,
    })
    globalGPUIndex += 1
    localGPUIndex += 1
  }
  return placement
}

/** First placement entry matching every provided coordinate filter. */
const getPlacementEntry = (
  placement: PlacementEntry[],
  filters: Partial<
    Pick<
      PlacementEntry,
      'dpReplica' | 'replicaGroup' | 'fsdpRank' | 'ppStage' | 'cpShard' | 'epLane' | 'tpLane'
    >
  >,
) =>
  placement.find(
    (entry) =>
      (filters.dpReplica == null || entry.dpReplica === filters.dpReplica) &&
      (filters.replicaGroup == null || entry.replicaGroup === filters.replicaGroup) &&
      (filters.fsdpRank == null || entry.fsdpRank === filters.fsdpRank) &&
      (filters.ppStage == null || entry.ppStage === filters.ppStage) &&
      (filters.cpShard == null || entry.cpShard === filters.cpShard) &&
      (filters.epLane == null || entry.epLane === filters.epLane) &&
      (filters.tpLane == null || entry.tpLane === filters.tpLane),
  )

/**
 * Derives DP and FSDP group sizes from the cluster and parallelism settings.
 * Returns null when the world size does not factor cleanly.
 */
const getDerivedParallelism = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
): DerivedParallelism | null => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const modelParallelSize = parallelism.tp * parallelism.pp * parallelism.cp * parallelism.ep
  if (modelParallelSize <= 0 || totalGPUs % modelParallelSize !== 0) {
    return null
  }
  const dp = totalGPUs / modelParallelSize
  // Default FSDP group = one model replica (i.e. no cross-replica sharding).
  const fsdpGroupSize = parallelism.fsdpShardGroupSize > 1 ? parallelism.fsdpShardGroupSize : modelParallelSize
  if (fsdpGroupSize % modelParallelSize !== 0 || totalGPUs % fsdpGroupSize !== 0) {
    return null
  }
  return {
    modelParallelSize,
    dp,
    replicaGroups: totalGPUs / fsdpGroupSize,
    fsdpGroupSize,
    fsdpDataParallelDegree: fsdpGroupSize / modelParallelSize,
  }
}

/** Bottleneck link for a collective: inter-node wins if members span nodes. */
const getMaxBandwidthForCollective = (
  members: PlacementEntry[],
  cluster: ClusterConfig,
) => {
  if (members.length <= 1) {
    return {
      bandwidthGBs: cluster.intraNodeBandwidthGBs,
      usesInterNode: false,
    }
  }
  const nodeSet = new Set(members.map((member) => member.nodeIndex))
  const usesInterNode = nodeSet.size > 1
  return {
    bandwidthGBs: usesInterNode ?
      cluster.interNodeBandwidthGBs : cluster.intraNodeBandwidthGBs,
    usesInterNode,
  }
}

/**
 * Ring-collective cost model: each GPU moves 2(w-1)/w of the message per
 * collective, timed against the group's bottleneck link bandwidth.
 */
const getRingCommStats = ({
  groupCount,
  groupWidth,
  messageBytes,
  collectiveCount,
  membersForBandwidth,
  cluster,
  totalStepTimeMs,
}: {
  groupCount: number
  groupWidth: number
  messageBytes: number
  collectiveCount: number
  membersForBandwidth: PlacementEntry[]
  cluster: ClusterConfig
  totalStepTimeMs: number
}): RingCommStats => {
  // Degenerate groups or empty traffic cost nothing.
  if (groupWidth <= 1 || collectiveCount <= 0 || messageBytes <= 0) {
    return {
      volumeBytesPerGpu: 0,
      totalVolumeBytes: 0,
      timePerStepMs: 0,
      linkUtilizationPercent: 0,
      usesInterNode: false,
    }
  }
  const ringVolumeBytes = (2 * (groupWidth - 1) * messageBytes) / groupWidth
  const volumeBytesPerGpu = ringVolumeBytes * collectiveCount
  const totalVolumeBytes = volumeBytesPerGpu * groupWidth * groupCount
  const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(
    membersForBandwidth,
    cluster,
  )
  const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
  const linkUtilizationPercent = totalStepTimeMs > 0 ? clamp(
    (bytesToGB(volumeBytesPerGpu) / (bandwidthGBs * (totalStepTimeMs / 1000))) * 100,
    0,
    100,
  ) : 0
  return {
    volumeBytesPerGpu,
    totalVolumeBytes,
    timePerStepMs,
    linkUtilizationPercent,
    usesInterNode,
  }
}

/**
 * Top-level entry point: validates the configuration, sizes memory per stage,
 * prices every communication family, and estimates step time / MFU.
 */
export function analyzeCluster(
  model: ModelConfig,
  training: TrainingConfig,
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
): ClusterAnalysis {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const derivedParallelism = getDerivedParallelism(cluster, parallelism)
  const globalBatchSizeTokens = training.microBatchSize * training.seqLength * training.gradAccumSteps * (derivedParallelism?.dp ??
    0)
  // Placeholder map of idle GPUs, used by every infeasible early return.
  const emptyGpuMap = Array.from({ length: totalGPUs }, (_, globalGPUIndex) => ({
    globalGPUIndex,
    nodeIndex: Math.floor(globalGPUIndex / cluster.gpusPerNode),
    localGPUIndex: globalGPUIndex % cluster.gpusPerNode,
    tpGroup: -1,
    tpLane: -1,
    ppStage: -1,
    cpShard: -1,
    epLane: -1,
    dpReplica: -1,
    replicaGroup: -1,
    fsdpRank: -1,
    memoryUsedGB: 0,
    memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
    isActive: false,
  }))
  // Zeroed-out analysis skeleton; callers overwrite infeasibilityReason.
  const emptyAnalysis = (): ClusterAnalysis => ({
    feasible: false,
    infeasibilityReason: 'Invalid configuration',
    totalParams: 0,
    activeParamsPerToken: 0,
    globalBatchSizeTokens,
    totalGPUs,
    derivedParallelism: {
      dp: derivedParallelism?.dp ?? 0,
      replicaGroups: derivedParallelism?.replicaGroups ?? 0,
      fsdpShardGroupSize: parallelism.fsdpShardGroupSize,
      fsdpGroupSize: derivedParallelism?.fsdpGroupSize ?? 0,
      ep: parallelism.ep,
    },
    memoryBreakdown: {
      parametersGB: 0,
      optimizerStatesGB: 0,
      gradientsGB: 0,
      activationsGB: 0,
      totalGB: 0,
      hbmCapacityGB: cluster.gpuType.hbmCapacityGB,
      utilizationPercent: 0,
    },
    pipelineStages: [],
    communication: {
      tp: { allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER, messageSizeBytes: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, linkUtilizationPercent: 0, },
      pp: { activationMessageSizeBytes: 0, numP2PTransfersPerStep: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, usesInterNode: false, },
      cp: { collectivesPerLayer: CP_COLLECTIVES_PER_LAYER, messageSizeBytes: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, linkUtilizationPercent: 0, usesInterNode: false, },
      fsdp: { collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER, messageSizeBytes: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, linkUtilizationPercent: 0, usesInterNode: false, },
      ep: { allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER, messageSizeBytes: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, linkUtilizationPercent: 0, usesInterNode: false, },
      dp: { gradientVolumePerGPU_GB: 0, allReduceTimeMs: 0, canOverlapWithBackward: false, linkUtilizationPercent: 0, },
    },
    throughput: {
      computeTimePerStepMs: 0,
      communicationTimePerStepMs: 0,
      pipelineBubbleFraction: 0,
      pipelineBubbleTimeMs: 0,
      totalStepTimeMs: 0,
      tokensPerSecond: 0,
      mfu: 0,
    },
    gpuMap: emptyGpuMap,
    links: [],
  })
  // ---- Feasibility gates: each failure returns early with a specific reason.
  if (
    training.microBatchSize <= 0 ||
    training.seqLength <= 0 ||
    training.gradAccumSteps <= 0 ||
    parallelism.tp <= 0 ||
    parallelism.pp <= 0 ||
    parallelism.cp <= 0 ||
    parallelism.ep <= 0
  ) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = 'Batch sizes and parallelism degrees must all be positive.'
    return analysis
  }
  if (parallelism.tp * parallelism.ep > cluster.gpusPerNode) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `TP × EP requires ${parallelism.tp * parallelism.ep} GPUs per node, but nodes only have ${cluster.gpusPerNode}.`
    return analysis
  }
  if (!derivedParallelism) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `World size ${totalGPUs} must be divisible by TP × PP × CP × EP, and the FSDP shard group must divide the cluster cleanly.`
    return analysis
  }
  if (model.hiddenDim % model.numHeads !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `hiddenDim ${model.hiddenDim} must divide evenly across ${model.numHeads} attention heads.`
    return analysis
  }
  if (model.numHeads % parallelism.tp !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `TP ${parallelism.tp} must divide the ${model.numHeads} attention heads.`
    return analysis
  }
  if (model.numKVHeads % parallelism.tp !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `TP ${parallelism.tp} should divide the ${model.numKVHeads} KV heads for clean GQA sharding.`
    return analysis
  }
  if (training.seqLength % parallelism.cp !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `CP ${parallelism.cp} must divide the sequence length ${training.seqLength}.`
    return analysis
  }
  if (model.architecture === 'moe' && !model.moe) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = 'MoE models require expert metadata.'
    return analysis
  }
  if (model.architecture === 'moe' && model.moe && model.moe.numExperts % parallelism.ep !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `EP ${parallelism.ep} must divide the ${model.moe.numExperts} experts.`
    return analysis
  }
  // ---- Memory sizing and placement.
  const { modelBreakdown, layerDistribution, stageMemory, stageParameters } = getStageMemoryMap(
    model,
    training,
    parallelism,
    derivedParallelism,
  )
  const placement = buildPlacement(cluster, parallelism, derivedParallelism, totalGPUs)
  const maxStageLayers = Math.max(...layerDistribution.map((stage) => stage.numLayers), 0)
  const pipelineStages = layerDistribution.map((stage) => {
    const stageMemoryBreakdown = stageMemory.get(stage.stageIndex)
    const stageParameterCount = stageParameters.get(stage.stageIndex)
    return {
      stageIndex: stage.stageIndex,
      layerRange: [stage.startLayer, stage.endLayer] as [number, number],
      numLayers: stage.numLayers,
      // Aggregate across every GPU that holds a shard of this stage.
      memoryGB: round2(
        (stageMemoryBreakdown?.totalGB ?? 0) * parallelism.tp * parallelism.cp * parallelism.ep * derivedParallelism.dp,
      ),
      hasEmbedding: stageParameterCount?.hasEmbedding ?? false,
      hasOutputHead: stageParameterCount?.hasOutputHead ?? false,
    }
  })
  // Worst stage = most per-GPU memory; used for the headline memory breakdown.
  const worstStageIndex = pipelineStages.reduce((worstIndex, stage) => {
    const worstStageMemory = stageMemory.get(worstIndex)?.totalGB ?? 0
    const candidateStageMemory = stageMemory.get(stage.stageIndex)?.totalGB ?? 0
    return candidateStageMemory > worstStageMemory ? stage.stageIndex : worstIndex
  }, 0)
  const worstStageMemory = stageMemory.get(worstStageIndex) ?? {
    parametersGB: 0,
    optimizerStatesGB: 0,
    gradientsGB: 0,
    activationsGB: 0,
    totalGB: 0,
  }
  // 1F1B-style bubble: (pp-1)/(m + pp - 1) with m = grad-accum micro-batches.
  const pipelineBubbleFraction = parallelism.pp <= 1 ? 0 : (parallelism.pp - 1) / (training.gradAccumSteps + parallelism.pp - 1)
  const boundaryStageCount = Math.min(
    parallelism.pp,
    Math.max(0, Math.round(pipelineBubbleFraction * parallelism.pp)),
  )
  const gpuMap = placement.map((entry) => {
    const stageMemoryBreakdown = entry.ppStage >= 0 ? stageMemory.get(entry.ppStage) ??
      { parametersGB: 0, optimizerStatesGB: 0, gradientsGB: 0, activationsGB: 0, totalGB: 0, } : { parametersGB: 0, optimizerStatesGB: 0, gradientsGB: 0, activationsGB: 0, totalGB: 0, }
    // Mark trailing stages as bubble-idle for visualization purposes.
    const bubbleIdle = entry.ppStage >= parallelism.pp - boundaryStageCount && entry.ppStage >= 0
    return {
      globalGPUIndex: entry.globalGPUIndex,
      nodeIndex: entry.nodeIndex,
      localGPUIndex: entry.localGPUIndex,
      tpGroup: entry.tpGroup,
      tpLane: entry.tpLane,
      ppStage: entry.ppStage,
      cpShard: entry.cpShard,
      epLane: entry.epLane,
      dpReplica: entry.dpReplica,
      replicaGroup: entry.replicaGroup,
      fsdpRank: entry.fsdpRank,
      memoryUsedGB: round2(entry.isActive ? stageMemoryBreakdown.totalGB : 0),
      memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
      isActive: entry.isActive && !bubbleIdle,
    }
  })
  // ---- Compute-time model (6ND FLOPs heuristic with attention/ckpt factors).
  const activationBytes = getActivationBytes(training.precision)
  const shardedSequenceLength = training.seqLength / parallelism.cp
  const tokensPerMicroBatchShard = training.microBatchSize * shardedSequenceLength
  const collectiveMessageBytes = tokensPerMicroBatchShard * model.hiddenDim * activationBytes
  const attentionComputeMultiplier = 0.65 + 0.35 * getAttentionMultiplier(model, training.seqLength)
  // Activation checkpointing re-runs part of the forward pass (~20% extra).
  const activationCheckpointComputeMultiplier = training.activationCheckpointing ? 1.2 : 1
  const totalFlopsPerStep = 6 * modelBreakdown.activeParamsPerToken * training.microBatchSize * training.seqLength * training.gradAccumSteps * derivedParallelism.dp * attentionComputeMultiplier * activationCheckpointComputeMultiplier
  const launchedGPUs = Math.max(totalGPUs, 1)
  const flopsPerGpuPerStep = totalFlopsPerStep / launchedGPUs
  const peakTFLOPs = getPeakTFLOPsForPrecision(cluster.gpuType, training.precision)
  const sustainedTFLOPs = peakTFLOPs * getSustainedComputeEfficiency(training)
  const computeTimePerStepMs = (flopsPerGpuPerStep / (sustainedTFLOPs * 1e12)) * 1000
  const pipelineBubbleTimeMs = pipelineBubbleFraction >= 1 ?
    0 : (computeTimePerStepMs * pipelineBubbleFraction) / (1 - pipelineBubbleFraction)
  // Provisional step time used only to express comm volume as utilization.
  const tentativeTotalStepTimeMs = computeTimePerStepMs + pipelineBubbleTimeMs
  // ---- TP all-reduce traffic (one representative group sets the bandwidth).
  const tpMembers = placement.filter(
    (entry) =>
      entry.dpReplica === 0 && entry.ppStage === 0 && entry.cpShard === 0 && entry.epLane === 0 && entry.tpLane >= 0,
  )
  const tpStats = getRingCommStats({
    groupCount: parallelism.pp * parallelism.cp * parallelism.ep * derivedParallelism.dp,
    groupWidth: parallelism.tp,
    messageBytes: collectiveMessageBytes,
    collectiveCount: TP_ALL_REDUCES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
    membersForBandwidth: tpMembers,
    cluster,
    totalStepTimeMs: tentativeTotalStepTimeMs,
  })
  // ---- CP (context-parallel) collective traffic.
  const cpMembers = placement.filter(
    (entry) =>
      entry.dpReplica === 0 && entry.ppStage === 0 && entry.epLane === 0 && entry.tpLane === 0 && entry.cpShard >= 0,
  )
  const cpStats = getRingCommStats({
    groupCount: parallelism.pp * derivedParallelism.dp * parallelism.tp * parallelism.ep,
    groupWidth: parallelism.cp,
    messageBytes: collectiveMessageBytes,
    collectiveCount: CP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
    membersForBandwidth: cpMembers,
    cluster,
    totalStepTimeMs: tentativeTotalStepTimeMs,
  })
  // ---- FSDP all-gather/reduce-scatter traffic (ZeRO-3 only).
  const averageSharedLayerParams = model.numLayers > 0 ? (modelBreakdown.denseLayerCount * modelBreakdown.sharedDenseLayerParams + modelBreakdown.moeLayerCount * modelBreakdown.sharedMoeLayerParams) / model.numLayers : 0
  const fsdpMessageBytes = parallelism.zeroStage >= 3 && derivedParallelism.fsdpDataParallelDegree > 1 ?
    (averageSharedLayerParams / parallelism.tp / derivedParallelism.fsdpDataParallelDegree) * getParameterBytes(training.precision) : 0
  const fsdpMembers = placement.filter(
    (entry) =>
      entry.replicaGroup === 0 && entry.ppStage === 0 && entry.cpShard === 0 && entry.epLane === 0 && entry.tpLane === 0,
  )
  const fsdpStats = getRingCommStats({
    groupCount: derivedParallelism.replicaGroups * parallelism.pp * parallelism.cp * parallelism.ep * parallelism.tp,
    groupWidth: derivedParallelism.fsdpDataParallelDegree,
    messageBytes: fsdpMessageBytes,
    collectiveCount: FSDP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
    membersForBandwidth: fsdpMembers,
    cluster,
    totalStepTimeMs: tentativeTotalStepTimeMs,
  })
  // ---- EP all-to-all traffic (MoE token routing; dispatch + combine).
  const epMembers = placement.filter(
    (entry) =>
      entry.dpReplica === 0 && entry.ppStage === 0 && entry.cpShard === 0 && entry.tpLane === 0 && entry.epLane >= 0,
  )
  const moeLayerCount = modelBreakdown.moeLayerCount
  const epMessageBytes = model.architecture === 'moe' && model.moe ? tokensPerMicroBatchShard * model.hiddenDim * activationBytes * model.moe.expertsPerToken : 0
  const epTransferCount = EP_ALL_TO_ALLS_PER_LAYER * moeLayerCount * training.gradAccumSteps
  const epStats = (() => {
    if (parallelism.ep <= 1 || epTransferCount <= 0 || epMessageBytes <= 0) {
      return {
        totalVolumeBytes: 0,
        timePerStepMs: 0,
        linkUtilizationPercent: 0,
        usesInterNode: false,
      }
    }
    const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(epMembers, cluster)
    // ×2: every routed token is sent out (dispatch) and returned (combine).
    const volumeBytesPerGpu = epMessageBytes * epTransferCount * 2
    const totalVolumeBytes = volumeBytesPerGpu * parallelism.ep * parallelism.pp * parallelism.cp * parallelism.tp * derivedParallelism.dp
    const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
    const linkUtilizationPercent = tentativeTotalStepTimeMs > 0 ?
      clamp(
        (bytesToGB(volumeBytesPerGpu) / (bandwidthGBs * (tentativeTotalStepTimeMs / 1000))) * 100,
        0,
        100,
      ) : 0
    return {
      totalVolumeBytes,
      timePerStepMs,
      linkUtilizationPercent,
      usesInterNode,
    }
  })()
  // ---- PP point-to-point traffic across adjacent stage boundaries.
  let ppTotalVolumeBytes = 0
  let ppTimePerStepMs = 0
  let ppUsesInterNode = false
  for (let dpReplica = 0; dpReplica < derivedParallelism.dp; dpReplica += 1) {
    for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
      for (let stageIndex = 0; stageIndex < parallelism.pp - 1; stageIndex += 1) {
        const source = getPlacementEntry(placement, {
          dpReplica,
          ppStage: stageIndex,
          cpShard,
          epLane: 0,
          tpLane: 0,
        })
        const target = getPlacementEntry(placement, {
          dpReplica,
          ppStage: stageIndex + 1,
          cpShard,
          epLane: 0,
          tpLane: 0,
        })
        if (!source || !target) {
          continue
        }
        const usesInterNode = source.nodeIndex !== target.nodeIndex
        const bandwidthGBs = usesInterNode ? cluster.interNodeBandwidthGBs : cluster.intraNodeBandwidthGBs
        const perLaneBytes = collectiveMessageBytes / parallelism.tp
        ppUsesInterNode ||= usesInterNode
        // ×2 per boundary: forward activations plus backward gradients.
        ppTotalVolumeBytes += collectiveMessageBytes * 2 * training.gradAccumSteps
        ppTimePerStepMs += (bytesToGB(perLaneBytes) / bandwidthGBs) * 1000 * 2 * training.gradAccumSteps
      }
    }
  }
  // ---- DP gradient all-reduce (or cross-replica-group reduce under FSDP).
  const maxStageGradientBytes = Math.max(
    ...Array.from(stageMemory.values()).map((stage) => stage.gradientsGB * BYTES_PER_GB),
    0,
  )
  const dpGroupWidth = parallelism.fsdpShardGroupSize > 1 ? derivedParallelism.replicaGroups : derivedParallelism.dp
  const dpMembers = parallelism.fsdpShardGroupSize > 1 ? placement.filter(
    (entry) =>
      entry.fsdpRank === 0 && entry.ppStage === 0 && entry.cpShard === 0 && entry.epLane === 0 && entry.tpLane === 0,
  ) : placement.filter(
    (entry) =>
      entry.ppStage === 0 && entry.cpShard === 0 && entry.epLane === 0 && entry.tpLane === 0,
  )
  const gradientCommBytesPerGpu = dpGroupWidth > 1 ? (2 * (dpGroupWidth - 1) * maxStageGradientBytes) / dpGroupWidth : 0
  const dpBandwidth = getMaxBandwidthForCollective(dpMembers, cluster)
  const dpTimeMs = dpGroupWidth > 1 ?
    (bytesToGB(gradientCommBytesPerGpu) / dpBandwidth.bandwidthGBs) * 1000 : 0
  // With PP or grad accumulation, most of the all-reduce hides under backward.
  const canOverlapDp = dpGroupWidth > 1 && (parallelism.pp > 1 || training.gradAccumSteps > 1)
  const dpNonOverlappedTimeMs = dpTimeMs * (canOverlapDp ? 0.35 : 1)
  // ---- Final step time, throughput, and utilization figures.
  const communicationTimePerStepMs = tpStats.timePerStepMs + cpStats.timePerStepMs + fsdpStats.timePerStepMs + epStats.timePerStepMs + ppTimePerStepMs + dpNonOverlappedTimeMs
  const totalStepTimeMs = computeTimePerStepMs + pipelineBubbleTimeMs + communicationTimePerStepMs
  const tokensPerSecond = totalStepTimeMs > 0 ? globalBatchSizeTokens / (totalStepTimeMs / 1000) : 0
  const mfu = tokensPerSecond > 0 ? clamp(
    (6 * modelBreakdown.activeParamsPerToken * attentionComputeMultiplier * tokensPerSecond) / (launchedGPUs * peakTFLOPs * 1e12),
    0,
    1,
  ) : 0
  const dpLinkUtilizationPercent = dpGroupWidth > 1 && totalStepTimeMs > 0 ? clamp(
    (bytesToGB(gradientCommBytesPerGpu) / (dpBandwidth.bandwidthGBs * (totalStepTimeMs / 1000))) * 100,
    0,
    100,
  ) : 0
  const ppPerLaneVolumeGB = parallelism.pp > 1 ? bytesToGB(collectiveMessageBytes / parallelism.tp) * 2 * training.gradAccumSteps : 0
  const ppLinkUtilizationPercent = parallelism.pp > 1 && totalStepTimeMs > 0 ? clamp(
    (ppPerLaneVolumeGB / ((ppUsesInterNode ?
cluster.interNodeBandwidthGBs : cluster.intraNodeBandwidthGBs) * (totalStepTimeMs / 1000))) * 100, 0, 100, ) : 0 const links: ClusterAnalysis['links'] = [] const visualReplicaSamples = Math.min(derivedParallelism.dp, 12) const sampledDpReplicas = Array.from({ length: visualReplicaSamples }, (_, sampleIndex) => Math.floor((sampleIndex * derivedParallelism.dp) / visualReplicaSamples), ) for (const dpReplica of sampledDpReplicas) { for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) { for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { const tpEntries = placement .filter( (entry) => entry.dpReplica === dpReplica && entry.ppStage === ppStage && entry.cpShard === cpShard && entry.epLane === epLane, ) .sort((left, right) => left.tpLane - right.tpLane) if (parallelism.tp > 1) { for (let lane = 0; lane < tpEntries.length; lane += 1) { const from = tpEntries[lane] const to = tpEntries[(lane + 1) % tpEntries.length] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: 'nvlink', trafficType: 'tp', volumeGB: round2(bytesToGB(tpStats.volumeBytesPerGpu)), utilizationPercent: round2(tpStats.linkUtilizationPercent), }) } } if (ppStage < parallelism.pp - 1) { const nextTpEntries = placement .filter( (entry) => entry.dpReplica === dpReplica && entry.ppStage === ppStage + 1 && entry.cpShard === cpShard && entry.epLane === epLane, ) .sort((left, right) => left.tpLane - right.tpLane) for (let lane = 0; lane < Math.min(tpEntries.length, nextTpEntries.length); lane += 1) { const from = tpEntries[lane] const to = nextTpEntries[lane] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 
'nvlink' : 'infiniband', trafficType: 'pp', volumeGB: round2(ppPerLaneVolumeGB), utilizationPercent: round2(ppLinkUtilizationPercent), }) } } } } if (parallelism.cp > 1) { for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { const cpEntries = placement .filter( (entry) => entry.dpReplica === dpReplica && entry.ppStage === ppStage && entry.epLane === epLane && entry.tpLane === tpLane, ) .sort((left, right) => left.cpShard - right.cpShard) for (let shardIndex = 0; shardIndex < cpEntries.length; shardIndex += 1) { const from = cpEntries[shardIndex] const to = cpEntries[(shardIndex + 1) % cpEntries.length] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', trafficType: 'cp', volumeGB: round2(bytesToGB(cpStats.volumeBytesPerGpu)), utilizationPercent: round2(cpStats.linkUtilizationPercent), }) } } } } if (parallelism.ep > 1) { for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { const epEntries = placement .filter( (entry) => entry.dpReplica === dpReplica && entry.ppStage === ppStage && entry.cpShard === cpShard && entry.tpLane === tpLane, ) .sort((left, right) => left.epLane - right.epLane) for (let lane = 0; lane < epEntries.length; lane += 1) { const from = epEntries[lane] const to = epEntries[(lane + 1) % epEntries.length] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', trafficType: 'ep', volumeGB: round2( epStats.totalVolumeBytes > 0 ? 
bytesToGB(epStats.totalVolumeBytes) / (parallelism.ep * Math.max(parallelism.tp * parallelism.cp * parallelism.pp * derivedParallelism.dp, 1)) : 0, ), utilizationPercent: round2(epStats.linkUtilizationPercent), }) } } } } if (derivedParallelism.fsdpDataParallelDegree > 1) { for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { const fsdpEntries = placement .filter( (entry) => entry.replicaGroup === placement.find((item) => item.dpReplica === dpReplica)?.replicaGroup && entry.ppStage === ppStage && entry.cpShard === cpShard && entry.epLane === epLane && entry.tpLane === tpLane, ) .sort((left, right) => left.fsdpRank - right.fsdpRank) for (let rank = 0; rank < fsdpEntries.length; rank += 1) { const from = fsdpEntries[rank] const to = fsdpEntries[(rank + 1) % fsdpEntries.length] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', trafficType: 'fsdp', volumeGB: round2(bytesToGB(fsdpStats.volumeBytesPerGpu)), utilizationPercent: round2(fsdpStats.linkUtilizationPercent), }) } } } } } if (dpGroupWidth > 1) { for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { const current = placement.find((entry) => entry.dpReplica === dpReplica) if (!current) { continue } const from = getPlacementEntry(placement, { replicaGroup: parallelism.fsdpShardGroupSize > 1 ? current.replicaGroup : undefined, fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined, dpReplica: parallelism.fsdpShardGroupSize > 1 ? undefined : dpReplica, ppStage, cpShard, epLane, tpLane, }) const to = getPlacementEntry(placement, { replicaGroup: parallelism.fsdpShardGroupSize > 1 ? 
(current.replicaGroup + 1) % derivedParallelism.replicaGroups : undefined, fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined, dpReplica: parallelism.fsdpShardGroupSize > 1 ? undefined : (dpReplica + 1) % derivedParallelism.dp, ppStage, cpShard, epLane, tpLane, }) if (!from || !to) { continue } links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', trafficType: 'dp', volumeGB: round2(bytesToGB(gradientCommBytesPerGpu)), utilizationPercent: round2(dpLinkUtilizationPercent), }) } } } } } } const feasible = worstStageMemory.totalGB <= cluster.gpuType.hbmCapacityGB const infeasibilityReason = feasible ? undefined : `Stage ${worstStageIndex} uses ${round2(worstStageMemory.totalGB)} GB per GPU, exceeding ${cluster.gpuType.hbmCapacityGB} GB of HBM.` return { feasible, infeasibilityReason, totalParams: Math.round(modelBreakdown.totalParams), activeParamsPerToken: Math.round(modelBreakdown.activeParamsPerToken), globalBatchSizeTokens, totalGPUs, derivedParallelism: { dp: derivedParallelism.dp, replicaGroups: derivedParallelism.replicaGroups, fsdpShardGroupSize: parallelism.fsdpShardGroupSize, fsdpGroupSize: derivedParallelism.fsdpGroupSize, ep: parallelism.ep, }, memoryBreakdown: { parametersGB: round2(worstStageMemory.parametersGB), optimizerStatesGB: round2(worstStageMemory.optimizerStatesGB), gradientsGB: round2(worstStageMemory.gradientsGB), activationsGB: round2(worstStageMemory.activationsGB), totalGB: round2(worstStageMemory.totalGB), hbmCapacityGB: cluster.gpuType.hbmCapacityGB, utilizationPercent: round2( (worstStageMemory.totalGB / cluster.gpuType.hbmCapacityGB) * 100, ), }, pipelineStages, communication: { tp: { allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER, messageSizeBytes: collectiveMessageBytes, totalVolumePerStepGB: round2(bytesToGB(tpStats.totalVolumeBytes)), timePerStepMs: round2(tpStats.timePerStepMs), linkUtilizationPercent: 
round2(tpStats.linkUtilizationPercent), }, pp: { activationMessageSizeBytes: collectiveMessageBytes, numP2PTransfersPerStep: parallelism.pp > 1 ? 2 * (parallelism.pp - 1) * training.gradAccumSteps * parallelism.cp * parallelism.tp * derivedParallelism.dp : 0, totalVolumePerStepGB: round2(bytesToGB(ppTotalVolumeBytes)), timePerStepMs: round2(ppTimePerStepMs), usesInterNode: ppUsesInterNode, }, cp: { collectivesPerLayer: CP_COLLECTIVES_PER_LAYER, messageSizeBytes: collectiveMessageBytes, totalVolumePerStepGB: round2(bytesToGB(cpStats.totalVolumeBytes)), timePerStepMs: round2(cpStats.timePerStepMs), linkUtilizationPercent: round2(cpStats.linkUtilizationPercent), usesInterNode: cpStats.usesInterNode, }, fsdp: { collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER, messageSizeBytes: round2(fsdpMessageBytes), totalVolumePerStepGB: round2(bytesToGB(fsdpStats.totalVolumeBytes)), timePerStepMs: round2(fsdpStats.timePerStepMs), linkUtilizationPercent: round2(fsdpStats.linkUtilizationPercent), usesInterNode: fsdpStats.usesInterNode, }, ep: { allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER, messageSizeBytes: round2(epMessageBytes), totalVolumePerStepGB: round2(bytesToGB(epStats.totalVolumeBytes)), timePerStepMs: round2(epStats.timePerStepMs), linkUtilizationPercent: round2(epStats.linkUtilizationPercent), usesInterNode: epStats.usesInterNode, }, dp: { gradientVolumePerGPU_GB: round2(bytesToGB(gradientCommBytesPerGpu)), allReduceTimeMs: round2(dpTimeMs), canOverlapWithBackward: canOverlapDp, linkUtilizationPercent: round2(dpLinkUtilizationPercent), }, }, throughput: { computeTimePerStepMs: round2(computeTimePerStepMs), communicationTimePerStepMs: round2(communicationTimePerStepMs), pipelineBubbleFraction: round2(pipelineBubbleFraction), pipelineBubbleTimeMs: round2(pipelineBubbleTimeMs), totalStepTimeMs: round2(totalStepTimeMs), tokensPerSecond: round2(tokensPerSecond), mfu: round2(mfu), }, gpuMap, links, } } export const llama7B = (): ModelConfig => ({ architecture: 'dense', hiddenDim: 
4096,
  numLayers: 32,
  numHeads: 32,
  numKVHeads: 32,
  vocabSize: 32000,
  intermediateSize: 11008,
  tiedEmbeddings: false,
  attentionProfile: { type: 'full' },
})

/** Llama 2 70B preset: dense, grouped-query attention (64 query heads, 8 KV heads). */
export const llama70B = (): ModelConfig => ({
  architecture: 'dense',
  hiddenDim: 8192,
  numLayers: 80,
  numHeads: 64,
  numKVHeads: 8,
  vocabSize: 32000,
  intermediateSize: 28672,
  tiedEmbeddings: false,
  attentionProfile: { type: 'full' },
})

/** Llama 405B-scale preset: dense, full attention, 128k-entry vocabulary. */
export const llama405B = (): ModelConfig => ({
  architecture: 'dense',
  hiddenDim: 16384,
  numLayers: 126,
  numHeads: 128,
  numKVHeads: 8,
  vocabSize: 128256,
  intermediateSize: 53248,
  tiedEmbeddings: false,
  attentionProfile: { type: 'full' },
})

/** OLMo 3 32B preset: hybrid attention — 25% of layers global, the rest a 4k sliding window. */
export const olmo3_32B = (): ModelConfig => ({
  architecture: 'dense',
  hiddenDim: 5120,
  numLayers: 64,
  numHeads: 40,
  numKVHeads: 8,
  vocabSize: 100278,
  intermediateSize: 27648,
  tiedEmbeddings: false,
  attentionProfile: {
    type: 'hybrid',
    slidingWindowSize: 4096,
    globalAttentionFraction: 0.25,
  },
})

/**
 * Llama 3.1 405B preset. Its configuration is field-for-field identical to
 * {@link llama405B}, so it delegates instead of duplicating the literal —
 * this keeps the two presets from silently drifting apart.
 */
export const llama31_405B = (): ModelConfig => llama405B()

/**
 * ~400B-total-parameter MoE preset: 256 experts with 4 routed per token,
 * the first 6 layers dense, and hybrid attention with a global layer every
 * 4th layer (4k sliding window elsewhere).
 */
export const trinityLarge400B = (): ModelConfig => ({
  architecture: 'moe',
  hiddenDim: 3072,
  numLayers: 60,
  numHeads: 48,
  numKVHeads: 8,
  vocabSize: 200192,
  intermediateSize: 12288,
  tiedEmbeddings: false,
  attentionProfile: {
    type: 'hybrid',
    slidingWindowSize: 4096,
    globalAttentionEveryN: 4,
  },
  moe: {
    numExperts: 256,
    expertsPerToken: 4,
    numDenseLayers: 6,
    expertIntermediateSize: 3072,
    activeParamsPerToken: 13_000_000_000,
  },
})

/** NVIDIA A100 80GB accelerator spec. */
export const a100_80gb = (): GPUSpec => ({
  name: 'A100 80GB',
  hbmCapacityGB: 80,
  peakTFLOPsBF16: 312,
  memBandwidthTBs: 2,
})

/** NVIDIA H100 SXM accelerator spec. */
export const h100_sxm = (): GPUSpec => ({
  name: 'H100 SXM',
  hbmCapacityGB: 80,
  peakTFLOPsBF16: 989,
  memBandwidthTBs: 3.35,
})

// NOTE(review): these figures match GB200-class silicon (192 GB / 2250 TFLOPs);
// B300 is usually quoted with 288 GB HBM3e — confirm 192 GB is intentional here.
export const b300 = (): GPUSpec => ({
  name: 'B300',
  hbmCapacityGB: 192,
  peakTFLOPsBF16: 2250,
  memBandwidthTBs: 8,
})

/** NVIDIA GB200 accelerator spec (per-GPU view of a Grace-Blackwell superchip). */
export const gb200 = ():
GPUSpec => ({ name: 'GB200', hbmCapacityGB: 192, peakTFLOPsBF16: 2250, memBandwidthTBs: 8, }) export const singleNode8GPU = (gpuType: GPUSpec = a100_80gb()): ClusterConfig => { const fabric = getDefaultFabric(gpuType) return { gpuType, gpusPerNode: 8, numNodes: 1, intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, nodesPerRack: 1, rackLabel: 'node', nodeLabel: 'GPU host', podLabel: 'node', } } export const cluster64GPU = (gpuType: GPUSpec = h100_sxm()): ClusterConfig => { const fabric = getDefaultFabric(gpuType) return { gpuType, gpusPerNode: 8, numNodes: 8, intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, nodesPerRack: 4, rackLabel: 'rack', nodeLabel: 'GPU host', podLabel: 'rack', } } export const frontier576GPU = (): ClusterConfig => { const gpuType = gb200() const fabric = getDefaultFabric(gpuType) return { gpuType, gpusPerNode: 8, numNodes: 72, intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, nodesPerRack: 9, rackLabel: 'NVL72 rack', nodeLabel: 'compute tray', podLabel: 'rack', } }