Spaces:
Sleeping
Sleeping
/** Static architecture description of the transformer model being analyzed. */
export interface ModelConfig {
  /** 'dense' uses one MLP per layer; 'moe' routes tokens to experts. */
  architecture: 'dense' | 'moe'
  hiddenDim: number
  numLayers: number
  /** Number of attention (query) heads. */
  numHeads: number
  /** Number of key/value heads (GQA when smaller than numHeads). */
  numKVHeads: number
  vocabSize: number
  /** Dense MLP intermediate (feed-forward) width. */
  intermediateSize: number
  /** When true, the output head reuses the embedding matrix (adds no parameters). */
  tiedEmbeddings: boolean
  /** Optional sparse-attention description; omitted or 'full' means full attention. */
  attentionProfile?: {
    type: 'full' | 'hybrid'
    slidingWindowSize?: number
    /** Fraction of attention that stays global in a hybrid profile. */
    globalAttentionFraction?: number
    /** Alternative to the fraction: one global layer every N layers. */
    globalAttentionEveryN?: number
  }
  /** Expert metadata; required when architecture === 'moe' (see analyzeCluster guard). */
  moe?: {
    numExperts: number
    expertsPerToken: number
    /** Leading layers that stay dense before MoE layers begin. */
    numDenseLayers: number
    expertIntermediateSize: number
    /** Optional override for the derived active-parameter count. */
    activeParamsPerToken?: number
  }
}
/** Per-step training hyperparameters that drive the memory and throughput model. */
export interface TrainingConfig {
  /** Sequences per micro-batch on each data-parallel rank. */
  microBatchSize: number
  seqLength: number
  /** Micro-batches accumulated before one optimizer step. */
  gradAccumSteps: number
  precision: 'fp32' | 'bf16' | 'fp16' | 'fp8'
  /** Recompute activations during backward to trade compute for memory. */
  activationCheckpointing: boolean
  optimizer: 'adam' | 'adamw' | 'sgd' | 'muon'
}
/** Hardware characteristics of a single accelerator. */
export interface GPUSpec {
  name: string
  /** On-device HBM capacity, in gigabytes. */
  hbmCapacityGB: number
  /** Peak dense BF16 throughput; other precisions are scaled from this value. */
  peakTFLOPsBF16: number
  /** HBM bandwidth, in terabytes per second. */
  memBandwidthTBs: number
}
/** Physical cluster topology and interconnect bandwidths. */
export interface ClusterConfig {
  gpuType: GPUSpec
  gpusPerNode: number
  numNodes: number
  /** Per-GPU intra-node (e.g. NVLink) bandwidth, GB/s. */
  intraNodeBandwidthGBs: number
  /** Per-GPU inter-node (e.g. InfiniBand) bandwidth, GB/s. */
  interNodeBandwidthGBs: number
  nodesPerRack?: number
  /** Optional display labels — presumably for visualization; not used in the math here. */
  rackLabel?: string
  nodeLabel?: string
  podLabel?: string
}
/** User-selected parallelism degrees; data parallelism is derived from these. */
export interface ParallelismConfig {
  /** Tensor-parallel degree. */
  tp: number
  /** Pipeline-parallel degree. */
  pp: number
  /** Context (sequence) parallel degree. */
  cp: number
  /** Expert-parallel degree (MoE only). */
  ep: number
  /** Shard optimizer state across the full data-parallel dimension (ZeRO-1 style). */
  distributedOptimizer: boolean
  /** GPUs per FSDP shard group; values <= 1 disable FSDP-style sharding. */
  fsdpShardGroupSize: number
  zeroStage: 0 | 1 | 2 | 3
}
/**
 * Complete output of analyzeCluster. When `feasible` is false, only
 * `infeasibilityReason` and the echoed-back sizing fields are meaningful;
 * the remaining sections are zeroed.
 */
export interface ClusterAnalysis {
  feasible: boolean
  /** Human-readable explanation, present when feasible === false. */
  infeasibilityReason?: string
  totalParams: number
  /** Parameters a single token actually exercises (smaller than totalParams for MoE). */
  activeParamsPerToken: number
  globalBatchSizeTokens: number
  totalGPUs: number
  /** Degrees computed from cluster size and the requested ParallelismConfig. */
  derivedParallelism: {
    dp: number
    replicaGroups: number
    fsdpShardGroupSize: number
    fsdpGroupSize: number
    ep: number
  }
  /** Per-GPU memory estimate for the largest pipeline stage, in gigabytes. */
  memoryBreakdown: {
    parametersGB: number
    optimizerStatesGB: number
    gradientsGB: number
    activationsGB: number
    totalGB: number
    hbmCapacityGB: number
    utilizationPercent: number
  }
  /** Layer assignment and memory per pipeline stage; layerRange is inclusive. */
  pipelineStages: {
    stageIndex: number
    layerRange: [number, number]
    numLayers: number
    memoryGB: number
    hasEmbedding: boolean
    hasOutputHead: boolean
  }[]
  /** Per-step communication cost estimates for each parallelism axis. */
  communication: {
    /** Tensor-parallel all-reduces inside each layer. */
    tp: {
      allReducesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
    }
    /** Pipeline point-to-point activation transfers between stages. */
    pp: {
      activationMessageSizeBytes: number
      numP2PTransfersPerStep: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      usesInterNode: boolean
    }
    /** Context-parallel collectives across sequence shards. */
    cp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    /** FSDP parameter all-gather / gradient reduce-scatter traffic. */
    fsdp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    /** Expert-parallel all-to-all token dispatch/combine traffic. */
    ep: {
      allToAllsPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    /** Gradient all-reduce across data-parallel replicas. */
    dp: {
      gradientVolumePerGPU_GB: number
      allReduceTimeMs: number
      canOverlapWithBackward: boolean
      linkUtilizationPercent: number
    }
  }
  /** Step-time decomposition and model-FLOPs-utilization estimate. */
  throughput: {
    computeTimePerStepMs: number
    communicationTimePerStepMs: number
    pipelineBubbleFraction: number
    pipelineBubbleTimeMs: number
    totalStepTimeMs: number
    tokensPerSecond: number
    mfu: number
  }
  /** One entry per physical GPU; -1 coordinates mean "not part of the grid". */
  gpuMap: {
    globalGPUIndex: number
    nodeIndex: number
    localGPUIndex: number
    tpGroup: number
    tpLane: number
    ppStage: number
    cpShard: number
    epLane: number
    dpReplica: number
    replicaGroup: number
    fsdpRank: number
    memoryUsedGB: number
    memoryCapacityGB: number
    isActive: boolean
  }[]
  /** Traffic edges between GPU pairs, tagged by fabric and parallelism axis. */
  links: {
    fromGPU: number
    toGPU: number
    type: 'nvlink' | 'infiniband'
    trafficType: 'tp' | 'pp' | 'cp' | 'fsdp' | 'ep' | 'dp'
    volumeGB: number
    utilizationPercent: number
  }[]
}
/** Contiguous inclusive [startLayer, endLayer] range owned by one pipeline stage. */
type LayerDistribution = {
  stageIndex: number
  startLayer: number
  endLayer: number
  numLayers: number
}
/** Per-GPU memory breakdown of a single pipeline stage, in gigabytes. */
type StageMemory = {
  parametersGB: number
  optimizerStatesGB: number
  gradientsGB: number
  activationsGB: number
  totalGB: number
}
/** Parameter tallies for one pipeline stage, before TP/EP/ZeRO sharding. */
type StageParameterCount = {
  /** sharedParams + expertParams. */
  stageParams: number
  /** Parameters every token touches (attention, norms, dense MLPs, embeddings). */
  sharedParams: number
  /** All expert weights hosted by this stage's MoE layers. */
  expertParams: number
  denseLayers: number
  moeLayers: number
  hasEmbedding: boolean
  hasOutputHead: boolean
}
/** Coordinates of one GPU in the parallelism grid; -1 marks an unused filler slot. */
type PlacementEntry = {
  globalGPUIndex: number
  nodeIndex: number
  localGPUIndex: number
  tpGroup: number
  tpLane: number
  ppStage: number
  cpShard: number
  epLane: number
  dpReplica: number
  replicaGroup: number
  fsdpRank: number
  isActive: boolean
}
/** Degrees computed by getDerivedParallelism from cluster size and config. */
type DerivedParallelism = {
  /** tp * pp * cp * ep — GPUs holding one full set of model shards. */
  modelParallelSize: number
  /** Total data-parallel degree (totalGPUs / modelParallelSize). */
  dp: number
  /** Number of FSDP shard groups in the cluster (totalGPUs / fsdpGroupSize). */
  replicaGroups: number
  /** GPUs in one FSDP shard group; defaults to modelParallelSize when FSDP is off. */
  fsdpGroupSize: number
  /** Data-parallel ranks inside one FSDP shard group. */
  fsdpDataParallelDegree: number
}
/** Inferred shape of getModelBreakdown's return value. */
type ModelBreakdown = ReturnType<typeof getModelBreakdown>
/** Per-GPU cost summary for one ring-style collective pattern. */
type RingCommStats = {
  volumeBytesPerGpu: number
  totalVolumeBytes: number
  timePerStepMs: number
  linkUtilizationPercent: number
  usesInterNode: boolean
}
| const BYTES_PER_GB = 1e9 | |
| const TP_ALL_REDUCES_PER_LAYER = 4 | |
| const CP_COLLECTIVES_PER_LAYER = 2 | |
| const FSDP_COLLECTIVES_PER_LAYER = 4 | |
| const EP_ALL_TO_ALLS_PER_LAYER = 2 | |
| const DEFAULT_BF16_EFFICIENCY = 0.56 | |
| const clamp = (value: number, min: number, max: number) => | |
| Math.min(Math.max(value, min), max) | |
| const bytesToGB = (bytes: number) => bytes / BYTES_PER_GB | |
| const round2 = (value: number) => Math.round(value * 100) / 100 | |
| const getParameterBytes = (precision: TrainingConfig['precision']) => { | |
| switch (precision) { | |
| case 'fp32': | |
| return 4 | |
| case 'fp8': | |
| return 1 | |
| default: | |
| return 2 | |
| } | |
| } | |
| const getActivationBytes = (precision: TrainingConfig['precision']) => | |
| precision === 'fp32' ? 4 : 2 | |
| const getGradientBytes = (precision: TrainingConfig['precision']) => | |
| precision === 'fp32' ? 4 : 2 | |
| const getOptimizerBytesPerParam = ( | |
| optimizer: TrainingConfig['optimizer'], | |
| precision: TrainingConfig['precision'], | |
| ) => { | |
| if (optimizer === 'sgd') { | |
| return 4 | |
| } | |
| // Muon keeps lower optimizer state than Adam-family optimizers in practice. | |
| // We model it as 8 bytes per parameter of extra state on top of bf16 weights. | |
| if (optimizer === 'muon') { | |
| return 8 | |
| } | |
| return precision === 'fp32' ? 8 : 12 | |
| } | |
| const getPeakTFLOPsForPrecision = (gpu: GPUSpec, precision: TrainingConfig['precision']) => { | |
| switch (precision) { | |
| case 'fp32': | |
| return gpu.peakTFLOPsBF16 * 0.25 | |
| case 'fp8': | |
| return gpu.peakTFLOPsBF16 * 2 | |
| default: | |
| return gpu.peakTFLOPsBF16 | |
| } | |
| } | |
| const getSustainedComputeEfficiency = (training: TrainingConfig) => { | |
| const checkpointPenalty = training.activationCheckpointing ? 0.02 : 0 | |
| const fp32Penalty = training.precision === 'fp32' ? 0.08 : 0 | |
| const moeBoost = training.optimizer === 'muon' ? 0.02 : 0 | |
| return clamp(DEFAULT_BF16_EFFICIENCY - checkpointPenalty - fp32Penalty + moeBoost, 0.3, 0.62) | |
| } | |
| const distributeLayers = (numLayers: number, pp: number): LayerDistribution[] => { | |
| const baseLayers = Math.floor(numLayers / pp) | |
| const remainder = numLayers % pp | |
| let startLayer = 0 | |
| return Array.from({ length: pp }, (_, stageIndex) => { | |
| const stageLayers = baseLayers + (stageIndex < remainder ? 1 : 0) | |
| const endLayer = startLayer + stageLayers - 1 | |
| const distribution = { | |
| stageIndex, | |
| startLayer, | |
| endLayer, | |
| numLayers: stageLayers, | |
| } | |
| startLayer += stageLayers | |
| return distribution | |
| }) | |
| } | |
| const getDefaultFabric = (gpu: GPUSpec) => { | |
| const normalizedName = gpu.name.toLowerCase() | |
| if (normalizedName.includes('gb200')) { | |
| return { | |
| intraNodeBandwidthGBs: 900, | |
| interNodeBandwidthGBs: 100, | |
| } | |
| } | |
| if (normalizedName.includes('h100')) { | |
| return { | |
| intraNodeBandwidthGBs: 450, | |
| interNodeBandwidthGBs: 100, | |
| } | |
| } | |
| return { | |
| intraNodeBandwidthGBs: 300, | |
| interNodeBandwidthGBs: 50, | |
| } | |
| } | |
/**
 * Derives parameter counts from the architecture config: per-layer
 * attention/MLP/norm breakdowns, per-expert sizes, the grand total, and the
 * number of parameters active per token (for MoE, only the routed experts
 * count toward the active figure).
 *
 * NOTE(review): MoE router/gating parameters are not modeled — confirm that
 * is intentional (they are typically negligible).
 */
const getModelBreakdown = (model: ModelConfig) => {
  const headDim = model.hiddenDim / model.numHeads
  const embeddingParams = model.vocabSize * model.hiddenDim
  // K/V projections shrink under GQA when numKVHeads < numHeads.
  const kvProjectionDim = model.numKVHeads * headDim
  // Q + K + V + O projection weights per layer.
  const perLayerAttentionParams =
    model.hiddenDim * (model.hiddenDim + 2 * kvProjectionDim + model.hiddenDim)
  // Gated MLP: three hiddenDim x intermediateSize matrices.
  const perLayerDenseMlpParams = model.hiddenDim * model.intermediateSize * 3
  // Two norm vectors per layer.
  const perLayerNormParams = model.hiddenDim * 2
  const finalNormParams = model.hiddenDim
  // Tied embeddings reuse the embedding matrix as the LM head (0 extra params).
  const outputHeadParams = model.tiedEmbeddings ? 0 : embeddingParams
  // Each expert is a gated MLP with its own intermediate width.
  const perExpertParams =
    model.architecture === 'moe' && model.moe
      ? model.hiddenDim * model.moe.expertIntermediateSize * 3
      : 0
  const totalExpertParamsPerLayer =
    model.architecture === 'moe' && model.moe ? perExpertParams * model.moe.numExperts : 0
  // Dense models: every layer is dense; MoE: only the leading numDenseLayers.
  const denseLayerCount =
    model.architecture === 'moe' && model.moe ? model.moe.numDenseLayers : model.numLayers
  const moeLayerCount = model.numLayers - denseLayerCount
  // "Shared" parameters are traversed by every token (no expert routing).
  const sharedDenseLayerParams =
    perLayerAttentionParams + perLayerDenseMlpParams + perLayerNormParams
  const sharedMoeLayerParams = perLayerAttentionParams + perLayerNormParams
  const sharedParams =
    embeddingParams +
    denseLayerCount * sharedDenseLayerParams +
    moeLayerCount * sharedMoeLayerParams +
    finalNormParams +
    outputHeadParams
  const totalParams = sharedParams + moeLayerCount * totalExpertParamsPerLayer
  // Active params per token: shared weights plus only the expertsPerToken routed experts.
  const derivedActiveParams =
    model.architecture === 'moe' && model.moe
      ? embeddingParams +
        denseLayerCount * sharedDenseLayerParams +
        moeLayerCount *
          (sharedMoeLayerParams + model.moe.expertsPerToken * perExpertParams) +
        finalNormParams +
        outputHeadParams
      : totalParams
  // An explicit activeParamsPerToken in the config overrides the derivation.
  const activeParamsPerToken =
    model.architecture === 'moe' && model.moe?.activeParamsPerToken != null
      ? model.moe.activeParamsPerToken
      : derivedActiveParams
  const perLayerTotalParams =
    model.architecture === 'moe'
      ? sharedMoeLayerParams + totalExpertParamsPerLayer
      : sharedDenseLayerParams
  return {
    headDim,
    kvProjectionDim,
    embeddingParams,
    perLayerAttentionParams,
    perLayerDenseMlpParams,
    perLayerNormParams,
    perExpertParams,
    totalExpertParamsPerLayer,
    sharedDenseLayerParams,
    sharedMoeLayerParams,
    denseLayerCount,
    moeLayerCount,
    sharedParams,
    perLayerTotalParams,
    finalNormParams,
    outputHeadParams,
    totalParams,
    activeParamsPerToken,
  }
}
| const getConcurrentMicroBatches = ( | |
| training: TrainingConfig, | |
| parallelism: ParallelismConfig, | |
| ) => { | |
| if (parallelism.pp <= 1) { | |
| return 1 | |
| } | |
| return Math.max(1, Math.min(training.gradAccumSteps, parallelism.pp)) | |
| } | |
| const getAttentionMultiplier = (model: ModelConfig, seqLength: number) => { | |
| const profile = model.attentionProfile | |
| if (!profile || profile.type === 'full') { | |
| return 1 | |
| } | |
| const windowMultiplier = | |
| profile.slidingWindowSize != null | |
| ? clamp(profile.slidingWindowSize / seqLength, 0, 1) | |
| : 1 | |
| const globalFraction = | |
| profile.globalAttentionFraction ?? | |
| (profile.globalAttentionEveryN != null ? 1 / profile.globalAttentionEveryN : 0.25) | |
| return clamp(globalFraction + (1 - globalFraction) * windowMultiplier, windowMultiplier, 1) | |
| } | |
| const getStageLayerMix = (stage: LayerDistribution, model: ModelConfig) => { | |
| if (model.architecture !== 'moe' || !model.moe) { | |
| return { | |
| denseLayers: stage.numLayers, | |
| moeLayers: 0, | |
| } | |
| } | |
| const denseEnd = model.moe.numDenseLayers - 1 | |
| const denseLayers = | |
| denseEnd < stage.startLayer | |
| ? 0 | |
| : Math.max(0, Math.min(stage.endLayer, denseEnd) - stage.startLayer + 1) | |
| return { | |
| denseLayers, | |
| moeLayers: stage.numLayers - denseLayers, | |
| } | |
| } | |
| const getStageParameterCount = ( | |
| stage: LayerDistribution, | |
| modelBreakdown: ModelBreakdown, | |
| parallelism: ParallelismConfig, | |
| model: ModelConfig, | |
| ): StageParameterCount => { | |
| const layerMix = getStageLayerMix(stage, model) | |
| let sharedParams = | |
| layerMix.denseLayers * modelBreakdown.sharedDenseLayerParams + | |
| layerMix.moeLayers * modelBreakdown.sharedMoeLayerParams | |
| const expertParams = layerMix.moeLayers * modelBreakdown.totalExpertParamsPerLayer | |
| const hasEmbedding = stage.stageIndex === 0 | |
| const hasOutputHead = stage.stageIndex === parallelism.pp - 1 | |
| if (hasEmbedding) { | |
| sharedParams += modelBreakdown.embeddingParams | |
| } | |
| if (hasOutputHead) { | |
| sharedParams += modelBreakdown.finalNormParams + modelBreakdown.outputHeadParams | |
| } | |
| return { | |
| stageParams: sharedParams + expertParams, | |
| sharedParams, | |
| expertParams, | |
| denseLayers: layerMix.denseLayers, | |
| moeLayers: layerMix.moeLayers, | |
| hasEmbedding, | |
| hasOutputHead, | |
| } | |
| } | |
/**
 * Estimated activation bytes one layer keeps resident per micro-batch on a
 * single GPU, after CP (sequence), TP (tensor), and EP (expert) sharding.
 *
 * The trailing 6x/2x and 2x/0.25x multipliers are heuristic modeling
 * constants for the residual stream and intermediate tensors with/without
 * activation checkpointing — not exact framework measurements.
 */
const getActivationMemoryBytesPerLayer = ({
  model,
  training,
  parallelism,
  isMoeLayer,
}: {
  model: ModelConfig
  training: TrainingConfig
  parallelism: ParallelismConfig
  isMoeLayer: boolean
}) => {
  const activationBytes = getActivationBytes(training.precision)
  // CP splits the sequence dimension across ranks.
  const shardedSequenceLength = training.seqLength / parallelism.cp
  const tokensPerShard = training.microBatchSize * shardedSequenceLength
  // K/V width after GQA.
  const kvHiddenDim = model.numKVHeads * (model.hiddenDim / model.numHeads)
  const tpSequenceShardFactor = parallelism.tp > 1 ? parallelism.tp : 1
  // Sequence parallelism shards the residual stream and checkpointed layer boundaries across
  // the TP group. We assume TP-enabled dense training uses this Megatron-style optimization.
  const hiddenStateBytes =
    (tokensPerShard * model.hiddenDim * activationBytes) / tpSequenceShardFactor
  // Sparse attention shrinks attention-side activations proportionally.
  const attentionMultiplier = getAttentionMultiplier(model, training.seqLength)
  // Sequence-parallel CP reduces the activation footprint by the number of sequence shards.
  const qkvBytes =
    tokensPerShard * (model.hiddenDim + 2 * kvHiddenDim) * activationBytes * attentionMultiplier
  // Two intermediate tensors per gated MLP.
  const denseMlpBytes = tokensPerShard * model.intermediateSize * activationBytes * 2
  // MoE: expertsPerToken copies of expert intermediates, divided across EP ranks.
  const moeMlpBytes =
    isMoeLayer && model.moe
      ? (tokensPerShard *
          model.moe.expertIntermediateSize *
          activationBytes *
          model.moe.expertsPerToken *
          2) /
        Math.max(parallelism.ep, 1)
      : 0
  // TP splits the per-layer intermediates across the tensor-parallel group.
  const shardedIntermediateBytes =
    (qkvBytes + (isMoeLayer ? moeMlpBytes : denseMlpBytes)) / Math.max(parallelism.tp, 1)
  if (training.activationCheckpointing) {
    // Only layer-boundary activations plus a small live working set are kept.
    return hiddenStateBytes * 2 + shardedIntermediateBytes * 0.25
  }
  // Without checkpointing, several intermediate copies stay live for the backward pass.
  return hiddenStateBytes * 6 + shardedIntermediateBytes * 2
}
/**
 * Per-GPU memory breakdown (parameters / optimizer state / gradients /
 * activations) for one pipeline stage, applying TP/EP weight sharding and the
 * ZeRO-stage / FSDP / distributed-optimizer shard factors.
 */
const getStageMemory = (
  stageParams: StageParameterCount,
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const parameterBytes = getParameterBytes(training.precision)
  const gradientBytes = getGradientBytes(training.precision)
  const optimizerBytes = getOptimizerBytesPerParam(training.optimizer, training.precision)
  // FSDP shards across the data-parallel ranks inside one shard group.
  const fsdpShardFactor =
    parallelism.fsdpShardGroupSize > 1 ? derivedParallelism.fsdpDataParallelDegree : 1
  // Distributed optimizer (ZeRO-1 style) shards across the full DP dimension.
  const distributedShardFactor = parallelism.distributedOptimizer ? derivedParallelism.dp : 1
  // Stage 3 shards the parameters themselves (only when FSDP groups exist).
  const parameterShardFactor =
    parallelism.zeroStage >= 3 ? fsdpShardFactor : 1
  // Stage 1+ shards optimizer state — across the FSDP group if one is
  // configured, otherwise across DP via the distributed optimizer flag.
  const optimizerShardFactor =
    parallelism.zeroStage >= 1
      ? parallelism.fsdpShardGroupSize > 1
        ? fsdpShardFactor
        : distributedShardFactor
      : 1
  // Stage 2+ additionally shards gradients.
  const gradientShardFactor =
    parallelism.zeroStage >= 2
      ? parallelism.fsdpShardGroupSize > 1
        ? fsdpShardFactor
        : derivedParallelism.dp
      : 1
  // TP splits shared weights; expert weights split across both TP and EP.
  const sharedParamsLocal = stageParams.sharedParams / Math.max(parallelism.tp, 1)
  const expertParamsLocal =
    stageParams.expertParams / Math.max(parallelism.tp * parallelism.ep, 1)
  const parameterMemoryBytes =
    (sharedParamsLocal / parameterShardFactor + expertParamsLocal / parameterShardFactor) *
    parameterBytes
  const optimizerMemoryBytes =
    (sharedParamsLocal / optimizerShardFactor + expertParamsLocal / optimizerShardFactor) *
    optimizerBytes
  const gradientMemoryBytes =
    (sharedParamsLocal / gradientShardFactor + expertParamsLocal / gradientShardFactor) *
    gradientBytes
  const denseLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: false,
  })
  const moeLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: true,
  })
  // Pipeline stages hold activations for several in-flight micro-batches.
  const concurrentMicroBatches = getConcurrentMicroBatches(training, parallelism)
  let activationMemoryBytes =
    (denseLayerActivationBytes * stageParams.denseLayers +
      moeLayerActivationBytes * stageParams.moeLayers) *
    concurrentMicroBatches
  if (training.activationCheckpointing && stageParams.stageParams > 0) {
    // Headroom (1.5x the largest layer) while one layer is recomputed — modeling constant.
    activationMemoryBytes +=
      Math.max(denseLayerActivationBytes, moeLayerActivationBytes) * 1.5
  }
  const totalBytes =
    parameterMemoryBytes + optimizerMemoryBytes + gradientMemoryBytes + activationMemoryBytes
  return {
    parametersGB: bytesToGB(parameterMemoryBytes),
    optimizerStatesGB: bytesToGB(optimizerMemoryBytes),
    gradientsGB: bytesToGB(gradientMemoryBytes),
    activationsGB: bytesToGB(activationMemoryBytes),
    totalGB: bytesToGB(totalBytes),
  }
}
| const getStageMemoryMap = ( | |
| model: ModelConfig, | |
| training: TrainingConfig, | |
| parallelism: ParallelismConfig, | |
| derivedParallelism: DerivedParallelism, | |
| ) => { | |
| const modelBreakdown = getModelBreakdown(model) | |
| const layerDistribution = distributeLayers(model.numLayers, parallelism.pp) | |
| const stageMemory = new Map<number, StageMemory>() | |
| const stageParameters = new Map<number, StageParameterCount>() | |
| for (const stage of layerDistribution) { | |
| const stageParameterCount = getStageParameterCount(stage, modelBreakdown, parallelism, model) | |
| stageParameters.set(stage.stageIndex, stageParameterCount) | |
| stageMemory.set( | |
| stage.stageIndex, | |
| getStageMemory(stageParameterCount, model, training, parallelism, derivedParallelism), | |
| ) | |
| } | |
| return { | |
| modelBreakdown, | |
| layerDistribution, | |
| stageMemory, | |
| stageParameters, | |
| } | |
| } | |
/**
 * Assigns every GPU in the cluster a coordinate in the parallelism grid.
 *
 * Iteration order (fastest to slowest): tpLane, epLane, cpShard, ppStage,
 * fsdpRank, replicaGroup — so each TP x EP group is laid out contiguously and
 * kept within a single node. GPUs beyond the grid are appended as inactive
 * filler entries whose coordinates are all -1.
 */
const buildPlacement = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
  requiredGPUs: number,
) => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const placement: PlacementEntry[] = []
  let nodeIndex = 0
  let localGPUIndex = 0
  let globalGPUIndex = 0
  for (let replicaGroup = 0; replicaGroup < derivedParallelism.replicaGroups; replicaGroup += 1) {
    for (let fsdpRank = 0; fsdpRank < derivedParallelism.fsdpDataParallelDegree; fsdpRank += 1) {
      // Flat data-parallel index combining replica group and FSDP rank.
      const dpReplica = replicaGroup * derivedParallelism.fsdpDataParallelDegree + fsdpRank
      for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) {
        for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
          // Roll to the next node when the upcoming TP x EP block would not fit.
          // NOTE(review): skipped local slots are not back-filled, so if tp*ep
          // does not divide gpusPerNode the tail padding loop below can push
          // nodeIndex past the physical node count — confirm callers prevent this.
          if (localGPUIndex + parallelism.ep * parallelism.tp > cluster.gpusPerNode) {
            nodeIndex += 1
            localGPUIndex = 0
          }
          for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
            for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
              placement.push({
                globalGPUIndex,
                nodeIndex,
                localGPUIndex,
                // Unique id of this entry's TP group across all other axes.
                tpGroup:
                  (((dpReplica * parallelism.pp + ppStage) * parallelism.cp + cpShard) *
                    parallelism.ep) +
                  epLane,
                tpLane,
                ppStage,
                cpShard,
                epLane,
                dpReplica,
                replicaGroup,
                fsdpRank,
                isActive: globalGPUIndex < requiredGPUs,
              })
              globalGPUIndex += 1
              localGPUIndex += 1
            }
          }
        }
      }
    }
  }
  // Pad with inactive entries so every physical GPU is represented in the map.
  while (placement.length < totalGPUs) {
    if (localGPUIndex >= cluster.gpusPerNode) {
      nodeIndex += 1
      localGPUIndex = 0
    }
    placement.push({
      globalGPUIndex,
      nodeIndex,
      localGPUIndex,
      tpGroup: -1,
      tpLane: -1,
      ppStage: -1,
      cpShard: -1,
      epLane: -1,
      dpReplica: -1,
      replicaGroup: -1,
      fsdpRank: -1,
      isActive: false,
    })
    globalGPUIndex += 1
    localGPUIndex += 1
  }
  return placement
}
| const getPlacementEntry = ( | |
| placement: PlacementEntry[], | |
| filters: Partial< | |
| Pick< | |
| PlacementEntry, | |
| 'dpReplica' | 'replicaGroup' | 'fsdpRank' | 'ppStage' | 'cpShard' | 'epLane' | 'tpLane' | |
| > | |
| >, | |
| ) => | |
| placement.find( | |
| (entry) => | |
| (filters.dpReplica == null || entry.dpReplica === filters.dpReplica) && | |
| (filters.replicaGroup == null || entry.replicaGroup === filters.replicaGroup) && | |
| (filters.fsdpRank == null || entry.fsdpRank === filters.fsdpRank) && | |
| (filters.ppStage == null || entry.ppStage === filters.ppStage) && | |
| (filters.cpShard == null || entry.cpShard === filters.cpShard) && | |
| (filters.epLane == null || entry.epLane === filters.epLane) && | |
| (filters.tpLane == null || entry.tpLane === filters.tpLane), | |
| ) | |
| const getDerivedParallelism = ( | |
| cluster: ClusterConfig, | |
| parallelism: ParallelismConfig, | |
| ): DerivedParallelism | null => { | |
| const totalGPUs = cluster.gpusPerNode * cluster.numNodes | |
| const modelParallelSize = | |
| parallelism.tp * parallelism.pp * parallelism.cp * parallelism.ep | |
| if (modelParallelSize <= 0 || totalGPUs % modelParallelSize !== 0) { | |
| return null | |
| } | |
| const dp = totalGPUs / modelParallelSize | |
| const fsdpGroupSize = | |
| parallelism.fsdpShardGroupSize > 1 ? parallelism.fsdpShardGroupSize : modelParallelSize | |
| if (fsdpGroupSize % modelParallelSize !== 0 || totalGPUs % fsdpGroupSize !== 0) { | |
| return null | |
| } | |
| return { | |
| modelParallelSize, | |
| dp, | |
| replicaGroups: totalGPUs / fsdpGroupSize, | |
| fsdpGroupSize, | |
| fsdpDataParallelDegree: fsdpGroupSize / modelParallelSize, | |
| } | |
| } | |
| const getMaxBandwidthForCollective = ( | |
| members: PlacementEntry[], | |
| cluster: ClusterConfig, | |
| ) => { | |
| if (members.length <= 1) { | |
| return { | |
| bandwidthGBs: cluster.intraNodeBandwidthGBs, | |
| usesInterNode: false, | |
| } | |
| } | |
| const nodeSet = new Set(members.map((member) => member.nodeIndex)) | |
| const usesInterNode = nodeSet.size > 1 | |
| return { | |
| bandwidthGBs: usesInterNode | |
| ? cluster.interNodeBandwidthGBs | |
| : cluster.intraNodeBandwidthGBs, | |
| usesInterNode, | |
| } | |
| } | |
| const getRingCommStats = ({ | |
| groupCount, | |
| groupWidth, | |
| messageBytes, | |
| collectiveCount, | |
| membersForBandwidth, | |
| cluster, | |
| totalStepTimeMs, | |
| }: { | |
| groupCount: number | |
| groupWidth: number | |
| messageBytes: number | |
| collectiveCount: number | |
| membersForBandwidth: PlacementEntry[] | |
| cluster: ClusterConfig | |
| totalStepTimeMs: number | |
| }): RingCommStats => { | |
| if (groupWidth <= 1 || collectiveCount <= 0 || messageBytes <= 0) { | |
| return { | |
| volumeBytesPerGpu: 0, | |
| totalVolumeBytes: 0, | |
| timePerStepMs: 0, | |
| linkUtilizationPercent: 0, | |
| usesInterNode: false, | |
| } | |
| } | |
| const ringVolumeBytes = (2 * (groupWidth - 1) * messageBytes) / groupWidth | |
| const volumeBytesPerGpu = ringVolumeBytes * collectiveCount | |
| const totalVolumeBytes = volumeBytesPerGpu * groupWidth * groupCount | |
| const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective( | |
| membersForBandwidth, | |
| cluster, | |
| ) | |
| const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000 | |
| const linkUtilizationPercent = | |
| totalStepTimeMs > 0 | |
| ? clamp( | |
| (bytesToGB(volumeBytesPerGpu) / (bandwidthGBs * (totalStepTimeMs / 1000))) * 100, | |
| 0, | |
| 100, | |
| ) | |
| : 0 | |
| return { | |
| volumeBytesPerGpu, | |
| totalVolumeBytes, | |
| timePerStepMs, | |
| linkUtilizationPercent, | |
| usesInterNode, | |
| } | |
| } | |
| export function analyzeCluster( | |
| model: ModelConfig, | |
| training: TrainingConfig, | |
| cluster: ClusterConfig, | |
| parallelism: ParallelismConfig, | |
| ): ClusterAnalysis { | |
| const totalGPUs = cluster.gpusPerNode * cluster.numNodes | |
| const derivedParallelism = getDerivedParallelism(cluster, parallelism) | |
| const globalBatchSizeTokens = | |
| training.microBatchSize * | |
| training.seqLength * | |
| training.gradAccumSteps * | |
| (derivedParallelism?.dp ?? 0) | |
| const emptyGpuMap = Array.from({ length: totalGPUs }, (_, globalGPUIndex) => ({ | |
| globalGPUIndex, | |
| nodeIndex: Math.floor(globalGPUIndex / cluster.gpusPerNode), | |
| localGPUIndex: globalGPUIndex % cluster.gpusPerNode, | |
| tpGroup: -1, | |
| tpLane: -1, | |
| ppStage: -1, | |
| cpShard: -1, | |
| epLane: -1, | |
| dpReplica: -1, | |
| replicaGroup: -1, | |
| fsdpRank: -1, | |
| memoryUsedGB: 0, | |
| memoryCapacityGB: cluster.gpuType.hbmCapacityGB, | |
| isActive: false, | |
| })) | |
| const emptyAnalysis = (): ClusterAnalysis => ({ | |
| feasible: false, | |
| infeasibilityReason: 'Invalid configuration', | |
| totalParams: 0, | |
| activeParamsPerToken: 0, | |
| globalBatchSizeTokens, | |
| totalGPUs, | |
| derivedParallelism: { | |
| dp: derivedParallelism?.dp ?? 0, | |
| replicaGroups: derivedParallelism?.replicaGroups ?? 0, | |
| fsdpShardGroupSize: parallelism.fsdpShardGroupSize, | |
| fsdpGroupSize: derivedParallelism?.fsdpGroupSize ?? 0, | |
| ep: parallelism.ep, | |
| }, | |
| memoryBreakdown: { | |
| parametersGB: 0, | |
| optimizerStatesGB: 0, | |
| gradientsGB: 0, | |
| activationsGB: 0, | |
| totalGB: 0, | |
| hbmCapacityGB: cluster.gpuType.hbmCapacityGB, | |
| utilizationPercent: 0, | |
| }, | |
| pipelineStages: [], | |
| communication: { | |
| tp: { | |
| allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER, | |
| messageSizeBytes: 0, | |
| totalVolumePerStepGB: 0, | |
| timePerStepMs: 0, | |
| linkUtilizationPercent: 0, | |
| }, | |
| pp: { | |
| activationMessageSizeBytes: 0, | |
| numP2PTransfersPerStep: 0, | |
| totalVolumePerStepGB: 0, | |
| timePerStepMs: 0, | |
| usesInterNode: false, | |
| }, | |
| cp: { | |
| collectivesPerLayer: CP_COLLECTIVES_PER_LAYER, | |
| messageSizeBytes: 0, | |
| totalVolumePerStepGB: 0, | |
| timePerStepMs: 0, | |
| linkUtilizationPercent: 0, | |
| usesInterNode: false, | |
| }, | |
| fsdp: { | |
| collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER, | |
| messageSizeBytes: 0, | |
| totalVolumePerStepGB: 0, | |
| timePerStepMs: 0, | |
| linkUtilizationPercent: 0, | |
| usesInterNode: false, | |
| }, | |
| ep: { | |
| allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER, | |
| messageSizeBytes: 0, | |
| totalVolumePerStepGB: 0, | |
| timePerStepMs: 0, | |
| linkUtilizationPercent: 0, | |
| usesInterNode: false, | |
| }, | |
| dp: { | |
| gradientVolumePerGPU_GB: 0, | |
| allReduceTimeMs: 0, | |
| canOverlapWithBackward: false, | |
| linkUtilizationPercent: 0, | |
| }, | |
| }, | |
| throughput: { | |
| computeTimePerStepMs: 0, | |
| communicationTimePerStepMs: 0, | |
| pipelineBubbleFraction: 0, | |
| pipelineBubbleTimeMs: 0, | |
| totalStepTimeMs: 0, | |
| tokensPerSecond: 0, | |
| mfu: 0, | |
| }, | |
| gpuMap: emptyGpuMap, | |
| links: [], | |
| }) | |
| if ( | |
| training.microBatchSize <= 0 || | |
| training.seqLength <= 0 || | |
| training.gradAccumSteps <= 0 || | |
| parallelism.tp <= 0 || | |
| parallelism.pp <= 0 || | |
| parallelism.cp <= 0 || | |
| parallelism.ep <= 0 | |
| ) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = 'Batch sizes and parallelism degrees must all be positive.' | |
| return analysis | |
| } | |
| if (parallelism.tp * parallelism.ep > cluster.gpusPerNode) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = | |
| `TP × EP requires ${parallelism.tp * parallelism.ep} GPUs per node, but nodes only have ${cluster.gpusPerNode}.` | |
| return analysis | |
| } | |
| if (!derivedParallelism) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = | |
| `World size ${totalGPUs} must be divisible by TP × PP × CP × EP, and the FSDP shard group must divide the cluster cleanly.` | |
| return analysis | |
| } | |
| if (model.hiddenDim % model.numHeads !== 0) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = | |
| `hiddenDim ${model.hiddenDim} must divide evenly across ${model.numHeads} attention heads.` | |
| return analysis | |
| } | |
| if (model.numHeads % parallelism.tp !== 0) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = | |
| `TP ${parallelism.tp} must divide the ${model.numHeads} attention heads.` | |
| return analysis | |
| } | |
| if (model.numKVHeads % parallelism.tp !== 0) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = | |
| `TP ${parallelism.tp} should divide the ${model.numKVHeads} KV heads for clean GQA sharding.` | |
| return analysis | |
| } | |
| if (training.seqLength % parallelism.cp !== 0) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = | |
| `CP ${parallelism.cp} must divide the sequence length ${training.seqLength}.` | |
| return analysis | |
| } | |
| if (model.architecture === 'moe' && !model.moe) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = 'MoE models require expert metadata.' | |
| return analysis | |
| } | |
| if (model.architecture === 'moe' && model.moe && model.moe.numExperts % parallelism.ep !== 0) { | |
| const analysis = emptyAnalysis() | |
| analysis.infeasibilityReason = | |
| `EP ${parallelism.ep} must divide the ${model.moe.numExperts} experts.` | |
| return analysis | |
| } | |
| const { modelBreakdown, layerDistribution, stageMemory, stageParameters } = getStageMemoryMap( | |
| model, | |
| training, | |
| parallelism, | |
| derivedParallelism, | |
| ) | |
| const placement = buildPlacement(cluster, parallelism, derivedParallelism, totalGPUs) | |
| const maxStageLayers = Math.max(...layerDistribution.map((stage) => stage.numLayers), 0) | |
| const pipelineStages = layerDistribution.map((stage) => { | |
| const stageMemoryBreakdown = stageMemory.get(stage.stageIndex) | |
| const stageParameterCount = stageParameters.get(stage.stageIndex) | |
| return { | |
| stageIndex: stage.stageIndex, | |
| layerRange: [stage.startLayer, stage.endLayer] as [number, number], | |
| numLayers: stage.numLayers, | |
| memoryGB: round2( | |
| (stageMemoryBreakdown?.totalGB ?? 0) * | |
| parallelism.tp * | |
| parallelism.cp * | |
| parallelism.ep * | |
| derivedParallelism.dp, | |
| ), | |
| hasEmbedding: stageParameterCount?.hasEmbedding ?? false, | |
| hasOutputHead: stageParameterCount?.hasOutputHead ?? false, | |
| } | |
| }) | |
| const worstStageIndex = pipelineStages.reduce((worstIndex, stage) => { | |
| const worstStageMemory = stageMemory.get(worstIndex)?.totalGB ?? 0 | |
| const candidateStageMemory = stageMemory.get(stage.stageIndex)?.totalGB ?? 0 | |
| return candidateStageMemory > worstStageMemory ? stage.stageIndex : worstIndex | |
| }, 0) | |
| const worstStageMemory = stageMemory.get(worstStageIndex) ?? { | |
| parametersGB: 0, | |
| optimizerStatesGB: 0, | |
| gradientsGB: 0, | |
| activationsGB: 0, | |
| totalGB: 0, | |
| } | |
| const pipelineBubbleFraction = | |
| parallelism.pp <= 1 | |
| ? 0 | |
| : (parallelism.pp - 1) / (training.gradAccumSteps + parallelism.pp - 1) | |
| const boundaryStageCount = Math.min( | |
| parallelism.pp, | |
| Math.max(0, Math.round(pipelineBubbleFraction * parallelism.pp)), | |
| ) | |
| const gpuMap = placement.map((entry) => { | |
| const stageMemoryBreakdown = | |
| entry.ppStage >= 0 | |
| ? stageMemory.get(entry.ppStage) ?? { | |
| parametersGB: 0, | |
| optimizerStatesGB: 0, | |
| gradientsGB: 0, | |
| activationsGB: 0, | |
| totalGB: 0, | |
| } | |
| : { | |
| parametersGB: 0, | |
| optimizerStatesGB: 0, | |
| gradientsGB: 0, | |
| activationsGB: 0, | |
| totalGB: 0, | |
| } | |
| const bubbleIdle = entry.ppStage >= parallelism.pp - boundaryStageCount && entry.ppStage >= 0 | |
| return { | |
| globalGPUIndex: entry.globalGPUIndex, | |
| nodeIndex: entry.nodeIndex, | |
| localGPUIndex: entry.localGPUIndex, | |
| tpGroup: entry.tpGroup, | |
| tpLane: entry.tpLane, | |
| ppStage: entry.ppStage, | |
| cpShard: entry.cpShard, | |
| epLane: entry.epLane, | |
| dpReplica: entry.dpReplica, | |
| replicaGroup: entry.replicaGroup, | |
| fsdpRank: entry.fsdpRank, | |
| memoryUsedGB: round2(entry.isActive ? stageMemoryBreakdown.totalGB : 0), | |
| memoryCapacityGB: cluster.gpuType.hbmCapacityGB, | |
| isActive: entry.isActive && !bubbleIdle, | |
| } | |
| }) | |
| const activationBytes = getActivationBytes(training.precision) | |
| const shardedSequenceLength = training.seqLength / parallelism.cp | |
| const tokensPerMicroBatchShard = training.microBatchSize * shardedSequenceLength | |
| const collectiveMessageBytes = | |
| tokensPerMicroBatchShard * model.hiddenDim * activationBytes | |
| const attentionComputeMultiplier = 0.65 + 0.35 * getAttentionMultiplier(model, training.seqLength) | |
| const activationCheckpointComputeMultiplier = training.activationCheckpointing ? 1.2 : 1 | |
| const totalFlopsPerStep = | |
| 6 * | |
| modelBreakdown.activeParamsPerToken * | |
| training.microBatchSize * | |
| training.seqLength * | |
| training.gradAccumSteps * | |
| derivedParallelism.dp * | |
| attentionComputeMultiplier * | |
| activationCheckpointComputeMultiplier | |
| const launchedGPUs = Math.max(totalGPUs, 1) | |
| const flopsPerGpuPerStep = totalFlopsPerStep / launchedGPUs | |
| const peakTFLOPs = getPeakTFLOPsForPrecision(cluster.gpuType, training.precision) | |
| const sustainedTFLOPs = peakTFLOPs * getSustainedComputeEfficiency(training) | |
| const computeTimePerStepMs = (flopsPerGpuPerStep / (sustainedTFLOPs * 1e12)) * 1000 | |
| const pipelineBubbleTimeMs = | |
| pipelineBubbleFraction >= 1 | |
| ? 0 | |
| : (computeTimePerStepMs * pipelineBubbleFraction) / (1 - pipelineBubbleFraction) | |
| const tentativeTotalStepTimeMs = computeTimePerStepMs + pipelineBubbleTimeMs | |
| const tpMembers = placement.filter( | |
| (entry) => | |
| entry.dpReplica === 0 && | |
| entry.ppStage === 0 && | |
| entry.cpShard === 0 && | |
| entry.epLane === 0 && | |
| entry.tpLane >= 0, | |
| ) | |
| const tpStats = getRingCommStats({ | |
| groupCount: parallelism.pp * parallelism.cp * parallelism.ep * derivedParallelism.dp, | |
| groupWidth: parallelism.tp, | |
| messageBytes: collectiveMessageBytes, | |
| collectiveCount: TP_ALL_REDUCES_PER_LAYER * maxStageLayers * training.gradAccumSteps, | |
| membersForBandwidth: tpMembers, | |
| cluster, | |
| totalStepTimeMs: tentativeTotalStepTimeMs, | |
| }) | |
| const cpMembers = placement.filter( | |
| (entry) => | |
| entry.dpReplica === 0 && | |
| entry.ppStage === 0 && | |
| entry.epLane === 0 && | |
| entry.tpLane === 0 && | |
| entry.cpShard >= 0, | |
| ) | |
| const cpStats = getRingCommStats({ | |
| groupCount: parallelism.pp * derivedParallelism.dp * parallelism.tp * parallelism.ep, | |
| groupWidth: parallelism.cp, | |
| messageBytes: collectiveMessageBytes, | |
| collectiveCount: CP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps, | |
| membersForBandwidth: cpMembers, | |
| cluster, | |
| totalStepTimeMs: tentativeTotalStepTimeMs, | |
| }) | |
| const averageSharedLayerParams = | |
| model.numLayers > 0 | |
| ? (modelBreakdown.denseLayerCount * modelBreakdown.sharedDenseLayerParams + | |
| modelBreakdown.moeLayerCount * modelBreakdown.sharedMoeLayerParams) / | |
| model.numLayers | |
| : 0 | |
| const fsdpMessageBytes = | |
| parallelism.zeroStage >= 3 && derivedParallelism.fsdpDataParallelDegree > 1 | |
| ? (averageSharedLayerParams / parallelism.tp / derivedParallelism.fsdpDataParallelDegree) * | |
| getParameterBytes(training.precision) | |
| : 0 | |
| const fsdpMembers = placement.filter( | |
| (entry) => | |
| entry.replicaGroup === 0 && | |
| entry.ppStage === 0 && | |
| entry.cpShard === 0 && | |
| entry.epLane === 0 && | |
| entry.tpLane === 0, | |
| ) | |
| const fsdpStats = getRingCommStats({ | |
| groupCount: | |
| derivedParallelism.replicaGroups * | |
| parallelism.pp * | |
| parallelism.cp * | |
| parallelism.ep * | |
| parallelism.tp, | |
| groupWidth: derivedParallelism.fsdpDataParallelDegree, | |
| messageBytes: fsdpMessageBytes, | |
| collectiveCount: FSDP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps, | |
| membersForBandwidth: fsdpMembers, | |
| cluster, | |
| totalStepTimeMs: tentativeTotalStepTimeMs, | |
| }) | |
| const epMembers = placement.filter( | |
| (entry) => | |
| entry.dpReplica === 0 && | |
| entry.ppStage === 0 && | |
| entry.cpShard === 0 && | |
| entry.tpLane === 0 && | |
| entry.epLane >= 0, | |
| ) | |
| const moeLayerCount = modelBreakdown.moeLayerCount | |
| const epMessageBytes = | |
| model.architecture === 'moe' && model.moe | |
| ? tokensPerMicroBatchShard * | |
| model.hiddenDim * | |
| activationBytes * | |
| model.moe.expertsPerToken | |
| : 0 | |
| const epTransferCount = EP_ALL_TO_ALLS_PER_LAYER * moeLayerCount * training.gradAccumSteps | |
| const epStats = (() => { | |
| if (parallelism.ep <= 1 || epTransferCount <= 0 || epMessageBytes <= 0) { | |
| return { | |
| totalVolumeBytes: 0, | |
| timePerStepMs: 0, | |
| linkUtilizationPercent: 0, | |
| usesInterNode: false, | |
| } | |
| } | |
| const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(epMembers, cluster) | |
| const volumeBytesPerGpu = epMessageBytes * epTransferCount * 2 | |
| const totalVolumeBytes = | |
| volumeBytesPerGpu * | |
| parallelism.ep * | |
| parallelism.pp * | |
| parallelism.cp * | |
| parallelism.tp * | |
| derivedParallelism.dp | |
| const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000 | |
| const linkUtilizationPercent = | |
| tentativeTotalStepTimeMs > 0 | |
| ? clamp( | |
| (bytesToGB(volumeBytesPerGpu) / | |
| (bandwidthGBs * (tentativeTotalStepTimeMs / 1000))) * | |
| 100, | |
| 0, | |
| 100, | |
| ) | |
| : 0 | |
| return { | |
| totalVolumeBytes, | |
| timePerStepMs, | |
| linkUtilizationPercent, | |
| usesInterNode, | |
| } | |
| })() | |
| let ppTotalVolumeBytes = 0 | |
| let ppTimePerStepMs = 0 | |
| let ppUsesInterNode = false | |
| for (let dpReplica = 0; dpReplica < derivedParallelism.dp; dpReplica += 1) { | |
| for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { | |
| for (let stageIndex = 0; stageIndex < parallelism.pp - 1; stageIndex += 1) { | |
| const source = getPlacementEntry(placement, { | |
| dpReplica, | |
| ppStage: stageIndex, | |
| cpShard, | |
| epLane: 0, | |
| tpLane: 0, | |
| }) | |
| const target = getPlacementEntry(placement, { | |
| dpReplica, | |
| ppStage: stageIndex + 1, | |
| cpShard, | |
| epLane: 0, | |
| tpLane: 0, | |
| }) | |
| if (!source || !target) { | |
| continue | |
| } | |
| const usesInterNode = source.nodeIndex !== target.nodeIndex | |
| const bandwidthGBs = usesInterNode | |
| ? cluster.interNodeBandwidthGBs | |
| : cluster.intraNodeBandwidthGBs | |
| const perLaneBytes = collectiveMessageBytes / parallelism.tp | |
| ppUsesInterNode ||= usesInterNode | |
| ppTotalVolumeBytes += collectiveMessageBytes * 2 * training.gradAccumSteps | |
| ppTimePerStepMs += | |
| (bytesToGB(perLaneBytes) / bandwidthGBs) * 1000 * 2 * training.gradAccumSteps | |
| } | |
| } | |
| } | |
| const maxStageGradientBytes = Math.max( | |
| ...Array.from(stageMemory.values()).map((stage) => stage.gradientsGB * BYTES_PER_GB), | |
| 0, | |
| ) | |
| const dpGroupWidth = | |
| parallelism.fsdpShardGroupSize > 1 | |
| ? derivedParallelism.replicaGroups | |
| : derivedParallelism.dp | |
| const dpMembers = parallelism.fsdpShardGroupSize > 1 | |
| ? placement.filter( | |
| (entry) => | |
| entry.fsdpRank === 0 && | |
| entry.ppStage === 0 && | |
| entry.cpShard === 0 && | |
| entry.epLane === 0 && | |
| entry.tpLane === 0, | |
| ) | |
| : placement.filter( | |
| (entry) => | |
| entry.ppStage === 0 && | |
| entry.cpShard === 0 && | |
| entry.epLane === 0 && | |
| entry.tpLane === 0, | |
| ) | |
| const gradientCommBytesPerGpu = | |
| dpGroupWidth > 1 | |
| ? (2 * (dpGroupWidth - 1) * maxStageGradientBytes) / dpGroupWidth | |
| : 0 | |
| const dpBandwidth = getMaxBandwidthForCollective(dpMembers, cluster) | |
| const dpTimeMs = | |
| dpGroupWidth > 1 | |
| ? (bytesToGB(gradientCommBytesPerGpu) / dpBandwidth.bandwidthGBs) * 1000 | |
| : 0 | |
| const canOverlapDp = dpGroupWidth > 1 && (parallelism.pp > 1 || training.gradAccumSteps > 1) | |
| const dpNonOverlappedTimeMs = dpTimeMs * (canOverlapDp ? 0.35 : 1) | |
| const communicationTimePerStepMs = | |
| tpStats.timePerStepMs + | |
| cpStats.timePerStepMs + | |
| fsdpStats.timePerStepMs + | |
| epStats.timePerStepMs + | |
| ppTimePerStepMs + | |
| dpNonOverlappedTimeMs | |
| const totalStepTimeMs = | |
| computeTimePerStepMs + pipelineBubbleTimeMs + communicationTimePerStepMs | |
| const tokensPerSecond = | |
| totalStepTimeMs > 0 ? globalBatchSizeTokens / (totalStepTimeMs / 1000) : 0 | |
| const mfu = | |
| tokensPerSecond > 0 | |
| ? clamp( | |
| (6 * modelBreakdown.activeParamsPerToken * attentionComputeMultiplier * tokensPerSecond) / | |
| (launchedGPUs * peakTFLOPs * 1e12), | |
| 0, | |
| 1, | |
| ) | |
| : 0 | |
| const dpLinkUtilizationPercent = | |
| dpGroupWidth > 1 && totalStepTimeMs > 0 | |
| ? clamp( | |
| (bytesToGB(gradientCommBytesPerGpu) / | |
| (dpBandwidth.bandwidthGBs * (totalStepTimeMs / 1000))) * | |
| 100, | |
| 0, | |
| 100, | |
| ) | |
| : 0 | |
| const ppPerLaneVolumeGB = | |
| parallelism.pp > 1 | |
| ? bytesToGB(collectiveMessageBytes / parallelism.tp) * 2 * training.gradAccumSteps | |
| : 0 | |
| const ppLinkUtilizationPercent = | |
| parallelism.pp > 1 && totalStepTimeMs > 0 | |
| ? clamp( | |
| (ppPerLaneVolumeGB / | |
| ((ppUsesInterNode | |
| ? cluster.interNodeBandwidthGBs | |
| : cluster.intraNodeBandwidthGBs) * | |
| (totalStepTimeMs / 1000))) * | |
| 100, | |
| 0, | |
| 100, | |
| ) | |
| : 0 | |
| const links: ClusterAnalysis['links'] = [] | |
| const visualReplicaSamples = Math.min(derivedParallelism.dp, 12) | |
| const sampledDpReplicas = Array.from({ length: visualReplicaSamples }, (_, sampleIndex) => | |
| Math.floor((sampleIndex * derivedParallelism.dp) / visualReplicaSamples), | |
| ) | |
| for (const dpReplica of sampledDpReplicas) { | |
| for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) { | |
| for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { | |
| for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { | |
| const tpEntries = placement | |
| .filter( | |
| (entry) => | |
| entry.dpReplica === dpReplica && | |
| entry.ppStage === ppStage && | |
| entry.cpShard === cpShard && | |
| entry.epLane === epLane, | |
| ) | |
| .sort((left, right) => left.tpLane - right.tpLane) | |
| if (parallelism.tp > 1) { | |
| for (let lane = 0; lane < tpEntries.length; lane += 1) { | |
| const from = tpEntries[lane] | |
| const to = tpEntries[(lane + 1) % tpEntries.length] | |
| links.push({ | |
| fromGPU: from.globalGPUIndex, | |
| toGPU: to.globalGPUIndex, | |
| type: 'nvlink', | |
| trafficType: 'tp', | |
| volumeGB: round2(bytesToGB(tpStats.volumeBytesPerGpu)), | |
| utilizationPercent: round2(tpStats.linkUtilizationPercent), | |
| }) | |
| } | |
| } | |
| if (ppStage < parallelism.pp - 1) { | |
| const nextTpEntries = placement | |
| .filter( | |
| (entry) => | |
| entry.dpReplica === dpReplica && | |
| entry.ppStage === ppStage + 1 && | |
| entry.cpShard === cpShard && | |
| entry.epLane === epLane, | |
| ) | |
| .sort((left, right) => left.tpLane - right.tpLane) | |
| for (let lane = 0; lane < Math.min(tpEntries.length, nextTpEntries.length); lane += 1) { | |
| const from = tpEntries[lane] | |
| const to = nextTpEntries[lane] | |
| links.push({ | |
| fromGPU: from.globalGPUIndex, | |
| toGPU: to.globalGPUIndex, | |
| type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', | |
| trafficType: 'pp', | |
| volumeGB: round2(ppPerLaneVolumeGB), | |
| utilizationPercent: round2(ppLinkUtilizationPercent), | |
| }) | |
| } | |
| } | |
| } | |
| } | |
| if (parallelism.cp > 1) { | |
| for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { | |
| for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { | |
| const cpEntries = placement | |
| .filter( | |
| (entry) => | |
| entry.dpReplica === dpReplica && | |
| entry.ppStage === ppStage && | |
| entry.epLane === epLane && | |
| entry.tpLane === tpLane, | |
| ) | |
| .sort((left, right) => left.cpShard - right.cpShard) | |
| for (let shardIndex = 0; shardIndex < cpEntries.length; shardIndex += 1) { | |
| const from = cpEntries[shardIndex] | |
| const to = cpEntries[(shardIndex + 1) % cpEntries.length] | |
| links.push({ | |
| fromGPU: from.globalGPUIndex, | |
| toGPU: to.globalGPUIndex, | |
| type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', | |
| trafficType: 'cp', | |
| volumeGB: round2(bytesToGB(cpStats.volumeBytesPerGpu)), | |
| utilizationPercent: round2(cpStats.linkUtilizationPercent), | |
| }) | |
| } | |
| } | |
| } | |
| } | |
| if (parallelism.ep > 1) { | |
| for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { | |
| for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { | |
| const epEntries = placement | |
| .filter( | |
| (entry) => | |
| entry.dpReplica === dpReplica && | |
| entry.ppStage === ppStage && | |
| entry.cpShard === cpShard && | |
| entry.tpLane === tpLane, | |
| ) | |
| .sort((left, right) => left.epLane - right.epLane) | |
| for (let lane = 0; lane < epEntries.length; lane += 1) { | |
| const from = epEntries[lane] | |
| const to = epEntries[(lane + 1) % epEntries.length] | |
| links.push({ | |
| fromGPU: from.globalGPUIndex, | |
| toGPU: to.globalGPUIndex, | |
| type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', | |
| trafficType: 'ep', | |
| volumeGB: round2( | |
| epStats.totalVolumeBytes > 0 | |
| ? bytesToGB(epStats.totalVolumeBytes) / | |
| (parallelism.ep * | |
| Math.max(parallelism.tp * parallelism.cp * parallelism.pp * derivedParallelism.dp, 1)) | |
| : 0, | |
| ), | |
| utilizationPercent: round2(epStats.linkUtilizationPercent), | |
| }) | |
| } | |
| } | |
| } | |
| } | |
| if (derivedParallelism.fsdpDataParallelDegree > 1) { | |
| for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { | |
| for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { | |
| for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { | |
| const fsdpEntries = placement | |
| .filter( | |
| (entry) => | |
| entry.replicaGroup === placement.find((item) => item.dpReplica === dpReplica)?.replicaGroup && | |
| entry.ppStage === ppStage && | |
| entry.cpShard === cpShard && | |
| entry.epLane === epLane && | |
| entry.tpLane === tpLane, | |
| ) | |
| .sort((left, right) => left.fsdpRank - right.fsdpRank) | |
| for (let rank = 0; rank < fsdpEntries.length; rank += 1) { | |
| const from = fsdpEntries[rank] | |
| const to = fsdpEntries[(rank + 1) % fsdpEntries.length] | |
| links.push({ | |
| fromGPU: from.globalGPUIndex, | |
| toGPU: to.globalGPUIndex, | |
| type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', | |
| trafficType: 'fsdp', | |
| volumeGB: round2(bytesToGB(fsdpStats.volumeBytesPerGpu)), | |
| utilizationPercent: round2(fsdpStats.linkUtilizationPercent), | |
| }) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| if (dpGroupWidth > 1) { | |
| for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { | |
| for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { | |
| for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { | |
| const current = placement.find((entry) => entry.dpReplica === dpReplica) | |
| if (!current) { | |
| continue | |
| } | |
| const from = getPlacementEntry(placement, { | |
| replicaGroup: | |
| parallelism.fsdpShardGroupSize > 1 ? current.replicaGroup : undefined, | |
| fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined, | |
| dpReplica: parallelism.fsdpShardGroupSize > 1 ? undefined : dpReplica, | |
| ppStage, | |
| cpShard, | |
| epLane, | |
| tpLane, | |
| }) | |
| const to = getPlacementEntry(placement, { | |
| replicaGroup: | |
| parallelism.fsdpShardGroupSize > 1 | |
| ? (current.replicaGroup + 1) % derivedParallelism.replicaGroups | |
| : undefined, | |
| fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined, | |
| dpReplica: | |
| parallelism.fsdpShardGroupSize > 1 | |
| ? undefined | |
| : (dpReplica + 1) % derivedParallelism.dp, | |
| ppStage, | |
| cpShard, | |
| epLane, | |
| tpLane, | |
| }) | |
| if (!from || !to) { | |
| continue | |
| } | |
| links.push({ | |
| fromGPU: from.globalGPUIndex, | |
| toGPU: to.globalGPUIndex, | |
| type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', | |
| trafficType: 'dp', | |
| volumeGB: round2(bytesToGB(gradientCommBytesPerGpu)), | |
| utilizationPercent: round2(dpLinkUtilizationPercent), | |
| }) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| const feasible = worstStageMemory.totalGB <= cluster.gpuType.hbmCapacityGB | |
| const infeasibilityReason = feasible | |
| ? undefined | |
| : `Stage ${worstStageIndex} uses ${round2(worstStageMemory.totalGB)} GB per GPU, exceeding ${cluster.gpuType.hbmCapacityGB} GB of HBM.` | |
| return { | |
| feasible, | |
| infeasibilityReason, | |
| totalParams: Math.round(modelBreakdown.totalParams), | |
| activeParamsPerToken: Math.round(modelBreakdown.activeParamsPerToken), | |
| globalBatchSizeTokens, | |
| totalGPUs, | |
| derivedParallelism: { | |
| dp: derivedParallelism.dp, | |
| replicaGroups: derivedParallelism.replicaGroups, | |
| fsdpShardGroupSize: parallelism.fsdpShardGroupSize, | |
| fsdpGroupSize: derivedParallelism.fsdpGroupSize, | |
| ep: parallelism.ep, | |
| }, | |
| memoryBreakdown: { | |
| parametersGB: round2(worstStageMemory.parametersGB), | |
| optimizerStatesGB: round2(worstStageMemory.optimizerStatesGB), | |
| gradientsGB: round2(worstStageMemory.gradientsGB), | |
| activationsGB: round2(worstStageMemory.activationsGB), | |
| totalGB: round2(worstStageMemory.totalGB), | |
| hbmCapacityGB: cluster.gpuType.hbmCapacityGB, | |
| utilizationPercent: round2( | |
| (worstStageMemory.totalGB / cluster.gpuType.hbmCapacityGB) * 100, | |
| ), | |
| }, | |
| pipelineStages, | |
| communication: { | |
| tp: { | |
| allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER, | |
| messageSizeBytes: collectiveMessageBytes, | |
| totalVolumePerStepGB: round2(bytesToGB(tpStats.totalVolumeBytes)), | |
| timePerStepMs: round2(tpStats.timePerStepMs), | |
| linkUtilizationPercent: round2(tpStats.linkUtilizationPercent), | |
| }, | |
| pp: { | |
| activationMessageSizeBytes: collectiveMessageBytes, | |
| numP2PTransfersPerStep: | |
| parallelism.pp > 1 | |
| ? 2 * | |
| (parallelism.pp - 1) * | |
| training.gradAccumSteps * | |
| parallelism.cp * | |
| parallelism.tp * | |
| derivedParallelism.dp | |
| : 0, | |
| totalVolumePerStepGB: round2(bytesToGB(ppTotalVolumeBytes)), | |
| timePerStepMs: round2(ppTimePerStepMs), | |
| usesInterNode: ppUsesInterNode, | |
| }, | |
| cp: { | |
| collectivesPerLayer: CP_COLLECTIVES_PER_LAYER, | |
| messageSizeBytes: collectiveMessageBytes, | |
| totalVolumePerStepGB: round2(bytesToGB(cpStats.totalVolumeBytes)), | |
| timePerStepMs: round2(cpStats.timePerStepMs), | |
| linkUtilizationPercent: round2(cpStats.linkUtilizationPercent), | |
| usesInterNode: cpStats.usesInterNode, | |
| }, | |
| fsdp: { | |
| collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER, | |
| messageSizeBytes: round2(fsdpMessageBytes), | |
| totalVolumePerStepGB: round2(bytesToGB(fsdpStats.totalVolumeBytes)), | |
| timePerStepMs: round2(fsdpStats.timePerStepMs), | |
| linkUtilizationPercent: round2(fsdpStats.linkUtilizationPercent), | |
| usesInterNode: fsdpStats.usesInterNode, | |
| }, | |
| ep: { | |
| allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER, | |
| messageSizeBytes: round2(epMessageBytes), | |
| totalVolumePerStepGB: round2(bytesToGB(epStats.totalVolumeBytes)), | |
| timePerStepMs: round2(epStats.timePerStepMs), | |
| linkUtilizationPercent: round2(epStats.linkUtilizationPercent), | |
| usesInterNode: epStats.usesInterNode, | |
| }, | |
| dp: { | |
| gradientVolumePerGPU_GB: round2(bytesToGB(gradientCommBytesPerGpu)), | |
| allReduceTimeMs: round2(dpTimeMs), | |
| canOverlapWithBackward: canOverlapDp, | |
| linkUtilizationPercent: round2(dpLinkUtilizationPercent), | |
| }, | |
| }, | |
| throughput: { | |
| computeTimePerStepMs: round2(computeTimePerStepMs), | |
| communicationTimePerStepMs: round2(communicationTimePerStepMs), | |
| pipelineBubbleFraction: round2(pipelineBubbleFraction), | |
| pipelineBubbleTimeMs: round2(pipelineBubbleTimeMs), | |
| totalStepTimeMs: round2(totalStepTimeMs), | |
| tokensPerSecond: round2(tokensPerSecond), | |
| mfu: round2(mfu), | |
| }, | |
| gpuMap, | |
| links, | |
| } | |
| } | |
| export const llama7B = (): ModelConfig => ({ | |
| architecture: 'dense', | |
| hiddenDim: 4096, | |
| numLayers: 32, | |
| numHeads: 32, | |
| numKVHeads: 32, | |
| vocabSize: 32000, | |
| intermediateSize: 11008, | |
| tiedEmbeddings: false, | |
| attentionProfile: { | |
| type: 'full', | |
| }, | |
| }) | |
| export const llama70B = (): ModelConfig => ({ | |
| architecture: 'dense', | |
| hiddenDim: 8192, | |
| numLayers: 80, | |
| numHeads: 64, | |
| numKVHeads: 8, | |
| vocabSize: 32000, | |
| intermediateSize: 28672, | |
| tiedEmbeddings: false, | |
| attentionProfile: { | |
| type: 'full', | |
| }, | |
| }) | |
| export const llama405B = (): ModelConfig => ({ | |
| architecture: 'dense', | |
| hiddenDim: 16384, | |
| numLayers: 126, | |
| numHeads: 128, | |
| numKVHeads: 8, | |
| vocabSize: 128256, | |
| intermediateSize: 53248, | |
| tiedEmbeddings: false, | |
| attentionProfile: { | |
| type: 'full', | |
| }, | |
| }) | |
| export const olmo3_32B = (): ModelConfig => ({ | |
| architecture: 'dense', | |
| hiddenDim: 5120, | |
| numLayers: 64, | |
| numHeads: 40, | |
| numKVHeads: 8, | |
| vocabSize: 100278, | |
| intermediateSize: 27648, | |
| tiedEmbeddings: false, | |
| attentionProfile: { | |
| type: 'hybrid', | |
| slidingWindowSize: 4096, | |
| globalAttentionFraction: 0.25, | |
| }, | |
| }) | |
| export const llama31_405B = (): ModelConfig => ({ | |
| architecture: 'dense', | |
| hiddenDim: 16384, | |
| numLayers: 126, | |
| numHeads: 128, | |
| numKVHeads: 8, | |
| vocabSize: 128256, | |
| intermediateSize: 53248, | |
| tiedEmbeddings: false, | |
| attentionProfile: { | |
| type: 'full', | |
| }, | |
| }) | |
| export const trinityLarge400B = (): ModelConfig => ({ | |
| architecture: 'moe', | |
| hiddenDim: 3072, | |
| numLayers: 60, | |
| numHeads: 48, | |
| numKVHeads: 8, | |
| vocabSize: 200192, | |
| intermediateSize: 12288, | |
| tiedEmbeddings: false, | |
| attentionProfile: { | |
| type: 'hybrid', | |
| slidingWindowSize: 4096, | |
| globalAttentionEveryN: 4, | |
| }, | |
| moe: { | |
| numExperts: 256, | |
| expertsPerToken: 4, | |
| numDenseLayers: 6, | |
| expertIntermediateSize: 3072, | |
| activeParamsPerToken: 13_000_000_000, | |
| }, | |
| }) | |
| export const a100_80gb = (): GPUSpec => ({ | |
| name: 'A100 80GB', | |
| hbmCapacityGB: 80, | |
| peakTFLOPsBF16: 312, | |
| memBandwidthTBs: 2, | |
| }) | |
| export const h100_sxm = (): GPUSpec => ({ | |
| name: 'H100 SXM', | |
| hbmCapacityGB: 80, | |
| peakTFLOPsBF16: 989, | |
| memBandwidthTBs: 3.35, | |
| }) | |
| export const b300 = (): GPUSpec => ({ | |
| name: 'B300', | |
| hbmCapacityGB: 192, | |
| peakTFLOPsBF16: 2250, | |
| memBandwidthTBs: 8, | |
| }) | |
| export const gb200 = (): GPUSpec => ({ | |
| name: 'GB200', | |
| hbmCapacityGB: 192, | |
| peakTFLOPsBF16: 2250, | |
| memBandwidthTBs: 8, | |
| }) | |
| export const singleNode8GPU = (gpuType: GPUSpec = a100_80gb()): ClusterConfig => { | |
| const fabric = getDefaultFabric(gpuType) | |
| return { | |
| gpuType, | |
| gpusPerNode: 8, | |
| numNodes: 1, | |
| intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, | |
| interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, | |
| nodesPerRack: 1, | |
| rackLabel: 'node', | |
| nodeLabel: 'GPU host', | |
| podLabel: 'node', | |
| } | |
| } | |
| export const cluster64GPU = (gpuType: GPUSpec = h100_sxm()): ClusterConfig => { | |
| const fabric = getDefaultFabric(gpuType) | |
| return { | |
| gpuType, | |
| gpusPerNode: 8, | |
| numNodes: 8, | |
| intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, | |
| interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, | |
| nodesPerRack: 4, | |
| rackLabel: 'rack', | |
| nodeLabel: 'GPU host', | |
| podLabel: 'rack', | |
| } | |
| } | |
| export const frontier576GPU = (): ClusterConfig => { | |
| const gpuType = gb200() | |
| const fabric = getDefaultFabric(gpuType) | |
| return { | |
| gpuType, | |
| gpusPerNode: 8, | |
| numNodes: 72, | |
| intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, | |
| interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, | |
| nodesPerRack: 9, | |
| rackLabel: 'NVL72 rack', | |
| nodeLabel: 'compute tray', | |
| podLabel: 'rack', | |
| } | |
| } | |