// illustrated-cluster/src/lib/trainingClusterModel.ts
// WIP HF Space snapshot published by joeddav (commit 1f77aa7).
// Architecture hyperparameters of the model being trained.
export interface ModelConfig {
  architecture: 'dense' | 'moe'
  hiddenDim: number // residual-stream width; must divide evenly by numHeads
  numLayers: number
  numHeads: number // query attention heads
  numKVHeads: number // KV heads (GQA when < numHeads, MHA when equal)
  vocabSize: number
  intermediateSize: number // dense MLP width (gated: 3 weight matrices per layer)
  tiedEmbeddings: boolean // true → LM head reuses the embedding matrix (no extra params)
  // Optional attention scheme; absent or 'full' means every layer attends
  // over the entire sequence.
  attentionProfile?: {
    type: 'full' | 'hybrid'
    slidingWindowSize?: number // tokens each windowed layer attends over
    globalAttentionFraction?: number // share of global attention, 0..1
    globalAttentionEveryN?: number // alternative spec: one global layer every N
  }
  // Mixture-of-experts metadata; expected when architecture === 'moe'.
  moe?: {
    numExperts: number
    expertsPerToken: number // top-k routed experts per token
    numDenseLayers: number // leading layers that remain dense
    expertIntermediateSize: number // per-expert MLP width
    activeParamsPerToken?: number // explicit override of the derived active count
  }
}
// Training-run hyperparameters driving memory and throughput estimates.
export interface TrainingConfig {
  microBatchSize: number // sequences per micro-batch per model replica
  seqLength: number // tokens per sequence
  gradAccumSteps: number // micro-batches accumulated per optimizer step
  precision: 'fp32' | 'bf16' | 'fp16' | 'fp8'
  activationCheckpointing: boolean // recompute activations during backward
  optimizer: 'adam' | 'adamw' | 'sgd' | 'muon'
}
// Per-GPU hardware characteristics.
export interface GPUSpec {
  name: string // product name; also matched to pick default fabric bandwidths
  hbmCapacityGB: number
  peakTFLOPsBF16: number // dense bf16 peak; other precisions are scaled from it
  memBandwidthTBs: number // HBM bandwidth — not consumed by the visible estimates
}
// Physical cluster topology and interconnect bandwidths.
export interface ClusterConfig {
  gpuType: GPUSpec
  gpusPerNode: number
  numNodes: number
  intraNodeBandwidthGBs: number // NVLink-class bandwidth within a node (GB/s)
  interNodeBandwidthGBs: number // network bandwidth between nodes (GB/s)
  // Presentation metadata — presumably used by the visualization layer; verify.
  nodesPerRack?: number
  rackLabel?: string
  nodeLabel?: string
  podLabel?: string
}
// User-selected parallelism degrees. Data parallelism is not chosen here —
// it is derived from the leftover GPUs (see getDerivedParallelism).
export interface ParallelismConfig {
  tp: number // tensor parallel
  pp: number // pipeline parallel
  cp: number // context (sequence) parallel
  ep: number // expert parallel
  distributedOptimizer: boolean // shard optimizer state across all DP ranks
  fsdpShardGroupSize: number // GPUs per FSDP shard group; <= 1 disables FSDP grouping
  zeroStage: 0 | 1 | 2 | 3 // ZeRO: 1 shards optimizer, 2 adds gradients, 3 adds parameters
}
// Complete output of analyzeCluster: feasibility verdict plus memory,
// communication, throughput, and per-GPU placement estimates.
export interface ClusterAnalysis {
  feasible: boolean
  // Human-readable cause, populated only when feasible is false.
  infeasibilityReason?: string
  totalParams: number
  // Equals totalParams for dense models; smaller for MoE (top-k routing).
  activeParamsPerToken: number
  globalBatchSizeTokens: number
  totalGPUs: number
  // Degrees derived from the cluster size and the user's parallelism choices.
  derivedParallelism: {
    dp: number
    replicaGroups: number
    fsdpShardGroupSize: number
    fsdpGroupSize: number
    ep: number
  }
  // Memory estimate for a single GPU (GB), compared against its HBM capacity.
  memoryBreakdown: {
    parametersGB: number
    optimizerStatesGB: number
    gradientsGB: number
    activationsGB: number
    totalGB: number
    hbmCapacityGB: number
    utilizationPercent: number
  }
  // One entry per pipeline stage; layerRange is inclusive on both ends.
  pipelineStages: {
    stageIndex: number
    layerRange: [number, number]
    numLayers: number
    memoryGB: number
    hasEmbedding: boolean // stage 0 hosts the embedding table
    hasOutputHead: boolean // last stage hosts the final norm + output head
  }[]
  // Estimated traffic per optimizer step, broken down by parallelism style.
  communication: {
    tp: {
      allReducesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
    }
    // Point-to-point activation transfers between adjacent pipeline stages.
    pp: {
      activationMessageSizeBytes: number
      numP2PTransfersPerStep: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      usesInterNode: boolean
    }
    cp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    fsdp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    // All-to-all token routing for MoE expert parallelism.
    ep: {
      allToAllsPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    // Gradient all-reduce across data-parallel replicas.
    dp: {
      gradientVolumePerGPU_GB: number
      allReduceTimeMs: number
      canOverlapWithBackward: boolean
      linkUtilizationPercent: number
    }
  }
  throughput: {
    computeTimePerStepMs: number
    communicationTimePerStepMs: number
    pipelineBubbleFraction: number
    pipelineBubbleTimeMs: number
    totalStepTimeMs: number
    tokensPerSecond: number
    mfu: number // model FLOPs utilization
  }
  // One entry per physical GPU; -1 coordinates mark unassigned GPUs.
  gpuMap: {
    globalGPUIndex: number
    nodeIndex: number
    localGPUIndex: number
    tpGroup: number
    tpLane: number
    ppStage: number
    cpShard: number
    epLane: number
    dpReplica: number
    replicaGroup: number
    fsdpRank: number
    memoryUsedGB: number
    memoryCapacityGB: number
    isActive: boolean
  }[]
  // Logical links with attributed traffic, per parallelism style.
  links: {
    fromGPU: number
    toGPU: number
    type: 'nvlink' | 'infiniband'
    trafficType: 'tp' | 'pp' | 'cp' | 'fsdp' | 'ep' | 'dp'
    volumeGB: number
    utilizationPercent: number
  }[]
}
// Layer span assigned to one pipeline stage (inclusive layer indices).
type LayerDistribution = {
  stageIndex: number
  startLayer: number
  endLayer: number
  numLayers: number
}
// Per-GPU memory breakdown for one pipeline stage, in GB.
type StageMemory = {
  parametersGB: number
  optimizerStatesGB: number
  gradientsGB: number
  activationsGB: number
  totalGB: number
}
// Unsharded parameter counts owned by one pipeline stage.
type StageParameterCount = {
  stageParams: number // sharedParams + expertParams
  sharedParams: number // attention/norm/MLP (+ embedding/head on edge stages)
  expertParams: number // MoE expert weights only
  denseLayers: number
  moeLayers: number
  hasEmbedding: boolean
  hasOutputHead: boolean
}
// Logical coordinates of a single GPU in the parallelism grid; -1 marks an
// unassigned placeholder GPU.
type PlacementEntry = {
  globalGPUIndex: number
  nodeIndex: number
  localGPUIndex: number
  tpGroup: number
  tpLane: number
  ppStage: number
  cpShard: number
  epLane: number
  dpReplica: number
  replicaGroup: number
  fsdpRank: number
  isActive: boolean
}
// Group sizes derived from the cluster and the chosen parallelism degrees.
type DerivedParallelism = {
  modelParallelSize: number // tp * pp * cp * ep
  dp: number
  replicaGroups: number
  fsdpGroupSize: number
  fsdpDataParallelDegree: number
}
// Parameter-accounting result; its shape is whatever getModelBreakdown returns.
type ModelBreakdown = ReturnType<typeof getModelBreakdown>
// Output of the ring-collective cost model (getRingCommStats).
type RingCommStats = {
  volumeBytesPerGpu: number
  totalVolumeBytes: number
  timePerStepMs: number
  linkUtilizationPercent: number
  usesInterNode: boolean
}
// Decimal gigabyte (10^9 bytes) used for all byte → GB conversions.
const BYTES_PER_GB = 1e9
// Collectives issued per transformer layer for each parallelism style.
// NOTE(review): these look like Megatron-style forward+backward counts —
// confirm against the intended communication model.
const TP_ALL_REDUCES_PER_LAYER = 4
const CP_COLLECTIVES_PER_LAYER = 2
const FSDP_COLLECTIVES_PER_LAYER = 4
const EP_ALL_TO_ALLS_PER_LAYER = 2
// Baseline sustained-to-peak FLOP fraction assumed for bf16 training.
const DEFAULT_BF16_EFFICIENCY = 0.56
// Restrict `value` to the inclusive range [min, max].
const clamp = (value: number, min: number, max: number) => {
  const atLeastMin = Math.max(value, min)
  return Math.min(atLeastMin, max)
}
// Convert a raw byte count to decimal gigabytes.
const bytesToGB = (bytes: number) => {
  return bytes / BYTES_PER_GB
}
// Round to two decimal places (for display-friendly numbers).
const round2 = (value: number) => {
  return Math.round(value * 100) / 100
}
// Bytes needed to store one weight at the given training precision.
const getParameterBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }
  if (precision === 'fp8') {
    return 1
  }
  // bf16 / fp16
  return 2
}
// Activations are stored in fp32 only for fp32 training; otherwise 16-bit.
const getActivationBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }
  return 2
}
// Gradients mirror the activation width: fp32 runs keep fp32 gradients.
const getGradientBytes = (precision: TrainingConfig['precision']) => {
  if (precision === 'fp32') {
    return 4
  }
  return 2
}
// Optimizer-state bytes per parameter for the given optimizer/precision combo.
const getOptimizerBytesPerParam = (
  optimizer: TrainingConfig['optimizer'],
  precision: TrainingConfig['precision'],
) => {
  switch (optimizer) {
    case 'sgd':
      // Single fp32 momentum buffer.
      return 4
    case 'muon':
      // Muon keeps lower optimizer state than Adam-family optimizers in practice.
      // We model it as 8 bytes per parameter of extra state on top of bf16 weights.
      return 8
    default:
      // Adam/AdamW: two fp32 moments (8 B); mixed precision adds a master copy (12 B).
      return precision === 'fp32' ? 8 : 12
  }
}
// Scale the GPU's bf16 peak to the training precision: fp32 runs at a quarter,
// fp8 at double, and fp16 matches bf16.
const getPeakTFLOPsForPrecision = (gpu: GPUSpec, precision: TrainingConfig['precision']) => {
  let scale = 1
  if (precision === 'fp32') {
    scale = 0.25
  } else if (precision === 'fp8') {
    scale = 2
  }
  return gpu.peakTFLOPsBF16 * scale
}
// Estimate the fraction of peak FLOPs a GPU sustains for this training recipe.
// Starts from the bf16 baseline MFU and applies small recipe-dependent
// adjustments, clamped to a plausible [0.3, 0.62] band.
const getSustainedComputeEfficiency = (training: TrainingConfig) => {
  // Recompute in backward adds overhead when activation checkpointing is on.
  const checkpointPenalty = training.activationCheckpointing ? 0.02 : 0
  const fp32Penalty = training.precision === 'fp32' ? 0.08 : 0
  // Fix: this was previously named `moeBoost`, but the adjustment is tied to
  // the Muon optimizer, not to MoE models.
  const muonBoost = training.optimizer === 'muon' ? 0.02 : 0
  return clamp(DEFAULT_BF16_EFFICIENCY - checkpointPenalty - fp32Penalty + muonBoost, 0.3, 0.62)
}
// Split `numLayers` transformer layers across `pp` pipeline stages as evenly
// as possible; earlier stages absorb the remainder (one extra layer each).
const distributeLayers = (numLayers: number, pp: number): LayerDistribution[] => {
  const perStage = Math.floor(numLayers / pp)
  const stagesWithExtra = numLayers % pp
  const stages: LayerDistribution[] = []
  let cursor = 0
  for (let stageIndex = 0; stageIndex < pp; stageIndex += 1) {
    const count = perStage + (stageIndex < stagesWithExtra ? 1 : 0)
    stages.push({
      stageIndex,
      startLayer: cursor,
      endLayer: cursor + count - 1,
      numLayers: count,
    })
    cursor += count
  }
  return stages
}
// Fallback fabric bandwidths (GB/s) inferred from the GPU product name.
const getDefaultFabric = (gpu: GPUSpec) => {
  const name = gpu.name.toLowerCase()
  if (name.includes('gb200')) {
    return { intraNodeBandwidthGBs: 900, interNodeBandwidthGBs: 100 }
  }
  if (name.includes('h100')) {
    return { intraNodeBandwidthGBs: 450, interNodeBandwidthGBs: 100 }
  }
  // Conservative default for unrecognized hardware.
  return { intraNodeBandwidthGBs: 300, interNodeBandwidthGBs: 50 }
}
// Count parameters for the configured model: per-layer building blocks
// (attention, MLP, norms, experts) plus totals and the per-token active count.
// The inferred return type is aliased as ModelBreakdown for the whole module.
// Note: MoE router/gate weights are not counted.
const getModelBreakdown = (model: ModelConfig) => {
  const headDim = model.hiddenDim / model.numHeads
  const embeddingParams = model.vocabSize * model.hiddenDim
  const kvProjectionDim = model.numKVHeads * headDim
  // Attention projections: Q (d*d), K and V (d*kv each), output (d*d).
  const perLayerAttentionParams =
    model.hiddenDim * (model.hiddenDim + 2 * kvProjectionDim + model.hiddenDim)
  // Gated MLP: up, gate, and down projections (3 matrices of d*intermediate).
  const perLayerDenseMlpParams = model.hiddenDim * model.intermediateSize * 3
  // Two norm scale vectors per layer.
  const perLayerNormParams = model.hiddenDim * 2
  const finalNormParams = model.hiddenDim
  // Tied embeddings reuse the input table for the LM head.
  const outputHeadParams = model.tiedEmbeddings ? 0 : embeddingParams
  // One expert = a gated MLP of the expert's intermediate width.
  const perExpertParams =
    model.architecture === 'moe' && model.moe
      ? model.hiddenDim * model.moe.expertIntermediateSize * 3
      : 0
  const totalExpertParamsPerLayer =
    model.architecture === 'moe' && model.moe ? perExpertParams * model.moe.numExperts : 0
  // Dense models treat every layer as dense; MoE keeps the leading
  // numDenseLayers dense and makes the rest MoE layers.
  const denseLayerCount =
    model.architecture === 'moe' && model.moe ? model.moe.numDenseLayers : model.numLayers
  const moeLayerCount = model.numLayers - denseLayerCount
  // "Shared" = parameters every token touches (attention, norms, dense MLP).
  const sharedDenseLayerParams =
    perLayerAttentionParams + perLayerDenseMlpParams + perLayerNormParams
  const sharedMoeLayerParams = perLayerAttentionParams + perLayerNormParams
  const sharedParams =
    embeddingParams +
    denseLayerCount * sharedDenseLayerParams +
    moeLayerCount * sharedMoeLayerParams +
    finalNormParams +
    outputHeadParams
  const totalParams = sharedParams + moeLayerCount * totalExpertParamsPerLayer
  // Active-per-token: shared weights plus only the top-k routed experts.
  const derivedActiveParams =
    model.architecture === 'moe' && model.moe
      ? embeddingParams +
        denseLayerCount * sharedDenseLayerParams +
        moeLayerCount *
          (sharedMoeLayerParams + model.moe.expertsPerToken * perExpertParams) +
        finalNormParams +
        outputHeadParams
      : totalParams
  // Callers may pin the active-parameter count explicitly (e.g. from a model card).
  const activeParamsPerToken =
    model.architecture === 'moe' && model.moe?.activeParamsPerToken != null
      ? model.moe.activeParamsPerToken
      : derivedActiveParams
  const perLayerTotalParams =
    model.architecture === 'moe'
      ? sharedMoeLayerParams + totalExpertParamsPerLayer
      : sharedDenseLayerParams
  return {
    headDim,
    kvProjectionDim,
    embeddingParams,
    perLayerAttentionParams,
    perLayerDenseMlpParams,
    perLayerNormParams,
    perExpertParams,
    totalExpertParamsPerLayer,
    sharedDenseLayerParams,
    sharedMoeLayerParams,
    denseLayerCount,
    moeLayerCount,
    sharedParams,
    perLayerTotalParams,
    finalNormParams,
    outputHeadParams,
    totalParams,
    activeParamsPerToken,
  }
}
// Number of micro-batches simultaneously in flight (bounds activation memory).
// Without pipelining only one is live; with PP up to `pp` micro-batches are in
// flight, capped by the number of accumulation steps.
const getConcurrentMicroBatches = (
  training: TrainingConfig,
  parallelism: ParallelismConfig,
) => {
  const { pp } = parallelism
  if (pp <= 1) {
    return 1
  }
  const inFlight = Math.min(training.gradAccumSteps, pp)
  return Math.max(1, inFlight)
}
// Effective fraction of full-attention cost for hybrid/sliding-window schemes.
// Full attention (or no profile) costs 1.0; a hybrid scheme blends the global
// share with the windowed share of the sequence.
const getAttentionMultiplier = (model: ModelConfig, seqLength: number) => {
  const profile = model.attentionProfile
  if (profile == null || profile.type === 'full') {
    return 1
  }
  // Fraction of the sequence each sliding-window layer attends over.
  let windowMultiplier = 1
  if (profile.slidingWindowSize != null) {
    windowMultiplier = Math.min(Math.max(profile.slidingWindowSize / seqLength, 0), 1)
  }
  // Share of attention that is still global: explicit fraction, or 1/N when
  // expressed as "one global layer every N", or a 0.25 default.
  const globalFraction =
    profile.globalAttentionFraction ??
    (profile.globalAttentionEveryN != null ? 1 / profile.globalAttentionEveryN : 0.25)
  const blended = globalFraction + (1 - globalFraction) * windowMultiplier
  return Math.min(Math.max(blended, windowMultiplier), 1)
}
// Split a pipeline stage's layer count into dense vs. MoE layers. Dense layers
// occupy the first numDenseLayers positions of the network.
const getStageLayerMix = (stage: LayerDistribution, model: ModelConfig) => {
  if (model.architecture !== 'moe' || !model.moe) {
    return { denseLayers: stage.numLayers, moeLayers: 0 }
  }
  const lastDenseLayer = model.moe.numDenseLayers - 1
  let denseLayers = 0
  if (lastDenseLayer >= stage.startLayer) {
    // Overlap between [startLayer, endLayer] and the dense prefix.
    const overlapEnd = Math.min(stage.endLayer, lastDenseLayer)
    denseLayers = Math.max(0, overlapEnd - stage.startLayer + 1)
  }
  return { denseLayers, moeLayers: stage.numLayers - denseLayers }
}
// Parameters owned by one pipeline stage (before TP/EP sharding), split into
// shared (attention/norm/MLP + embedding/head) and expert weights.
const getStageParameterCount = (
  stage: LayerDistribution,
  modelBreakdown: ModelBreakdown,
  parallelism: ParallelismConfig,
  model: ModelConfig,
): StageParameterCount => {
  const { denseLayers, moeLayers } = getStageLayerMix(stage, model)
  const expertParams = moeLayers * modelBreakdown.totalExpertParamsPerLayer
  // First stage hosts the embedding table; last stage hosts final norm + head.
  const hasEmbedding = stage.stageIndex === 0
  const hasOutputHead = stage.stageIndex === parallelism.pp - 1
  let sharedParams =
    denseLayers * modelBreakdown.sharedDenseLayerParams +
    moeLayers * modelBreakdown.sharedMoeLayerParams
  if (hasEmbedding) {
    sharedParams += modelBreakdown.embeddingParams
  }
  if (hasOutputHead) {
    sharedParams += modelBreakdown.finalNormParams + modelBreakdown.outputHeadParams
  }
  return {
    stageParams: sharedParams + expertParams,
    sharedParams,
    expertParams,
    denseLayers,
    moeLayers,
    hasEmbedding,
    hasOutputHead,
  }
}
// Estimate the activation bytes one transformer layer keeps resident per GPU
// for a single micro-batch, after CP/TP/EP sharding. The 2x/0.25x/6x
// multipliers below are heuristic working-set factors, not exact accounting.
const getActivationMemoryBytesPerLayer = ({
  model,
  training,
  parallelism,
  isMoeLayer,
}: {
  model: ModelConfig
  training: TrainingConfig
  parallelism: ParallelismConfig
  isMoeLayer: boolean
}) => {
  const activationBytes = getActivationBytes(training.precision)
  // CP splits the sequence, so each rank holds seqLength / cp tokens.
  const shardedSequenceLength = training.seqLength / parallelism.cp
  const tokensPerShard = training.microBatchSize * shardedSequenceLength
  const kvHiddenDim = model.numKVHeads * (model.hiddenDim / model.numHeads)
  const tpSequenceShardFactor = parallelism.tp > 1 ? parallelism.tp : 1
  // Sequence parallelism shards the residual stream and checkpointed layer boundaries across
  // the TP group. We assume TP-enabled dense training uses this Megatron-style optimization.
  const hiddenStateBytes =
    (tokensPerShard * model.hiddenDim * activationBytes) / tpSequenceShardFactor
  const attentionMultiplier = getAttentionMultiplier(model, training.seqLength)
  // Sequence-parallel CP reduces the activation footprint by the number of sequence shards.
  const qkvBytes =
    tokensPerShard * (model.hiddenDim + 2 * kvHiddenDim) * activationBytes * attentionMultiplier
  // Two live intermediate tensors (up + gate) for the dense MLP.
  const denseMlpBytes = tokensPerShard * model.intermediateSize * activationBytes * 2
  // MoE layers: top-k expert activations, spread across the EP group.
  const moeMlpBytes =
    isMoeLayer && model.moe
      ? (tokensPerShard *
          model.moe.expertIntermediateSize *
          activationBytes *
          model.moe.expertsPerToken *
          2) /
        Math.max(parallelism.ep, 1)
      : 0
  // Attention + MLP intermediates are sharded across TP ranks.
  const shardedIntermediateBytes =
    (qkvBytes + (isMoeLayer ? moeMlpBytes : denseMlpBytes)) / Math.max(parallelism.tp, 1)
  if (training.activationCheckpointing) {
    // Checkpointing keeps roughly layer-boundary tensors plus a small
    // recompute working set.
    return hiddenStateBytes * 2 + shardedIntermediateBytes * 0.25
  }
  // Without checkpointing, several hidden-state-sized tensors (residuals,
  // norms, attention output) and full intermediates stay live.
  return hiddenStateBytes * 6 + shardedIntermediateBytes * 2
}
// Per-GPU memory (GB) for one pipeline stage: parameters, optimizer states,
// gradients, and resident activations after TP/EP/ZeRO/FSDP sharding.
const getStageMemory = (
  stageParams: StageParameterCount,
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const parameterBytes = getParameterBytes(training.precision)
  const gradientBytes = getGradientBytes(training.precision)
  const optimizerBytes = getOptimizerBytesPerParam(training.optimizer, training.precision)
  // FSDP shards within the shard group; the plain distributed optimizer shards
  // across the whole DP dimension instead.
  const fsdpShardFactor =
    parallelism.fsdpShardGroupSize > 1 ? derivedParallelism.fsdpDataParallelDegree : 1
  const distributedShardFactor = parallelism.distributedOptimizer ? derivedParallelism.dp : 1
  // ZeRO-3 additionally shards the weights themselves.
  const parameterShardFactor =
    parallelism.zeroStage >= 3 ? fsdpShardFactor : 1
  // ZeRO-1+ shards optimizer states; FSDP grouping takes precedence when on.
  const optimizerShardFactor =
    parallelism.zeroStage >= 1
      ? parallelism.fsdpShardGroupSize > 1
        ? fsdpShardFactor
        : distributedShardFactor
      : 1
  // ZeRO-2+ shards gradients.
  const gradientShardFactor =
    parallelism.zeroStage >= 2
      ? parallelism.fsdpShardGroupSize > 1
        ? fsdpShardFactor
        : derivedParallelism.dp
      : 1
  // TP splits shared weights; expert weights additionally split across EP.
  const sharedParamsLocal = stageParams.sharedParams / Math.max(parallelism.tp, 1)
  const expertParamsLocal =
    stageParams.expertParams / Math.max(parallelism.tp * parallelism.ep, 1)
  const parameterMemoryBytes =
    (sharedParamsLocal / parameterShardFactor + expertParamsLocal / parameterShardFactor) *
    parameterBytes
  const optimizerMemoryBytes =
    (sharedParamsLocal / optimizerShardFactor + expertParamsLocal / optimizerShardFactor) *
    optimizerBytes
  const gradientMemoryBytes =
    (sharedParamsLocal / gradientShardFactor + expertParamsLocal / gradientShardFactor) *
    gradientBytes
  const denseLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: false,
  })
  const moeLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: true,
  })
  // With pipelining, several micro-batches' activations are live at once.
  const concurrentMicroBatches = getConcurrentMicroBatches(training, parallelism)
  let activationMemoryBytes =
    (denseLayerActivationBytes * stageParams.denseLayers +
      moeLayerActivationBytes * stageParams.moeLayers) *
    concurrentMicroBatches
  if (training.activationCheckpointing && stageParams.stageParams > 0) {
    // Headroom for the layer currently being recomputed during backward.
    activationMemoryBytes +=
      Math.max(denseLayerActivationBytes, moeLayerActivationBytes) * 1.5
  }
  const totalBytes =
    parameterMemoryBytes + optimizerMemoryBytes + gradientMemoryBytes + activationMemoryBytes
  return {
    parametersGB: bytesToGB(parameterMemoryBytes),
    optimizerStatesGB: bytesToGB(optimizerMemoryBytes),
    gradientsGB: bytesToGB(gradientMemoryBytes),
    activationsGB: bytesToGB(activationMemoryBytes),
    totalGB: bytesToGB(totalBytes),
  }
}
const getStageMemoryMap = (
model: ModelConfig,
training: TrainingConfig,
parallelism: ParallelismConfig,
derivedParallelism: DerivedParallelism,
) => {
const modelBreakdown = getModelBreakdown(model)
const layerDistribution = distributeLayers(model.numLayers, parallelism.pp)
const stageMemory = new Map<number, StageMemory>()
const stageParameters = new Map<number, StageParameterCount>()
for (const stage of layerDistribution) {
const stageParameterCount = getStageParameterCount(stage, modelBreakdown, parallelism, model)
stageParameters.set(stage.stageIndex, stageParameterCount)
stageMemory.set(
stage.stageIndex,
getStageMemory(stageParameterCount, model, training, parallelism, derivedParallelism),
)
}
return {
modelBreakdown,
layerDistribution,
stageMemory,
stageParameters,
}
}
// Lay the logical parallelism grid out onto physical GPUs. Iteration order
// keeps TP lanes innermost (adjacent GPUs), then EP, CP, PP, FSDP rank, and
// replica group. Physical GPUs beyond the required count become inactive
// placeholders with -1 coordinates.
const buildPlacement = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
  requiredGPUs: number,
) => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const placement: PlacementEntry[] = []
  let nodeIndex = 0
  let localGPUIndex = 0
  let globalGPUIndex = 0
  for (let replicaGroup = 0; replicaGroup < derivedParallelism.replicaGroups; replicaGroup += 1) {
    for (let fsdpRank = 0; fsdpRank < derivedParallelism.fsdpDataParallelDegree; fsdpRank += 1) {
      // Global DP replica id = (group, rank-within-group) flattened.
      const dpReplica = replicaGroup * derivedParallelism.fsdpDataParallelDegree + fsdpRank
      for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) {
        for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
          // Keep each EP×TP block on one node: advance to a fresh node if the
          // block would straddle the node boundary.
          if (localGPUIndex + parallelism.ep * parallelism.tp > cluster.gpusPerNode) {
            nodeIndex += 1
            localGPUIndex = 0
          }
          for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
            for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
              placement.push({
                globalGPUIndex,
                nodeIndex,
                localGPUIndex,
                // Unique id of the TP group this lane belongs to.
                tpGroup:
                  (((dpReplica * parallelism.pp + ppStage) * parallelism.cp + cpShard) *
                    parallelism.ep) +
                  epLane,
                tpLane,
                ppStage,
                cpShard,
                epLane,
                dpReplica,
                replicaGroup,
                fsdpRank,
                isActive: globalGPUIndex < requiredGPUs,
              })
              globalGPUIndex += 1
              localGPUIndex += 1
            }
          }
        }
      }
    }
  }
  // Pad any remaining physical GPUs as unassigned placeholders.
  while (placement.length < totalGPUs) {
    if (localGPUIndex >= cluster.gpusPerNode) {
      nodeIndex += 1
      localGPUIndex = 0
    }
    placement.push({
      globalGPUIndex,
      nodeIndex,
      localGPUIndex,
      tpGroup: -1,
      tpLane: -1,
      ppStage: -1,
      cpShard: -1,
      epLane: -1,
      dpReplica: -1,
      replicaGroup: -1,
      fsdpRank: -1,
      isActive: false,
    })
    globalGPUIndex += 1
    localGPUIndex += 1
  }
  return placement
}
// Find the first placement entry matching every provided coordinate; filter
// keys left undefined/null are ignored.
const getPlacementEntry = (
  placement: PlacementEntry[],
  filters: Partial<
    Pick<
      PlacementEntry,
      'dpReplica' | 'replicaGroup' | 'fsdpRank' | 'ppStage' | 'cpShard' | 'epLane' | 'tpLane'
    >
  >,
) => {
  const coordinateKeys = [
    'dpReplica',
    'replicaGroup',
    'fsdpRank',
    'ppStage',
    'cpShard',
    'epLane',
    'tpLane',
  ] as const
  return placement.find((entry) =>
    coordinateKeys.every((key) => filters[key] == null || entry[key] === filters[key]),
  )
}
// Derive the data-parallel and FSDP group sizes implied by the cluster size
// and the chosen model-parallel degrees. Returns null when the degrees do not
// tile the cluster evenly.
const getDerivedParallelism = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
): DerivedParallelism | null => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const modelParallelSize = parallelism.tp * parallelism.pp * parallelism.cp * parallelism.ep
  if (modelParallelSize <= 0) {
    return null
  }
  if (totalGPUs % modelParallelSize !== 0) {
    return null
  }
  // Default FSDP group = one model replica (no cross-replica sharding).
  const fsdpGroupSize =
    parallelism.fsdpShardGroupSize > 1 ? parallelism.fsdpShardGroupSize : modelParallelSize
  const tilesModelReplicas = fsdpGroupSize % modelParallelSize === 0
  const tilesCluster = totalGPUs % fsdpGroupSize === 0
  if (!tilesModelReplicas || !tilesCluster) {
    return null
  }
  return {
    modelParallelSize,
    dp: totalGPUs / modelParallelSize,
    replicaGroups: totalGPUs / fsdpGroupSize,
    fsdpGroupSize,
    fsdpDataParallelDegree: fsdpGroupSize / modelParallelSize,
  }
}
// Bandwidth available to a collective: intra-node (NVLink-class) when every
// member sits on a single node, otherwise the slower inter-node fabric.
const getMaxBandwidthForCollective = (
  members: PlacementEntry[],
  cluster: ClusterConfig,
) => {
  if (members.length <= 1) {
    return { bandwidthGBs: cluster.intraNodeBandwidthGBs, usesInterNode: false }
  }
  const firstNode = members[0].nodeIndex
  const usesInterNode = members.some((member) => member.nodeIndex !== firstNode)
  const bandwidthGBs = usesInterNode
    ? cluster.interNodeBandwidthGBs
    : cluster.intraNodeBandwidthGBs
  return { bandwidthGBs, usesInterNode }
}
// Ring-collective cost model: each GPU moves 2*(w-1)/w of the message per
// collective (classic ring all-reduce), repeated `collectiveCount` times per
// step. Bandwidth is chosen from the slowest hop the member set spans.
const getRingCommStats = ({
  groupCount,
  groupWidth,
  messageBytes,
  collectiveCount,
  membersForBandwidth,
  cluster,
  totalStepTimeMs,
}: {
  groupCount: number
  groupWidth: number
  messageBytes: number
  collectiveCount: number
  membersForBandwidth: PlacementEntry[]
  cluster: ClusterConfig
  totalStepTimeMs: number
}): RingCommStats => {
  // Degenerate groups (width 1, nothing to send) communicate nothing.
  if (groupWidth <= 1 || collectiveCount <= 0 || messageBytes <= 0) {
    return {
      volumeBytesPerGpu: 0,
      totalVolumeBytes: 0,
      timePerStepMs: 0,
      linkUtilizationPercent: 0,
      usesInterNode: false,
    }
  }
  const ringVolumeBytes = (2 * (groupWidth - 1) * messageBytes) / groupWidth
  const volumeBytesPerGpu = ringVolumeBytes * collectiveCount
  const totalVolumeBytes = volumeBytesPerGpu * groupWidth * groupCount
  const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(
    membersForBandwidth,
    cluster,
  )
  const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
  // Fraction of the link this collective occupies over the whole step window.
  const linkUtilizationPercent =
    totalStepTimeMs > 0
      ? clamp(
          (bytesToGB(volumeBytesPerGpu) / (bandwidthGBs * (totalStepTimeMs / 1000))) * 100,
          0,
          100,
        )
      : 0
  return {
    volumeBytesPerGpu,
    totalVolumeBytes,
    timePerStepMs,
    linkUtilizationPercent,
    usesInterNode,
  }
}
export function analyzeCluster(
model: ModelConfig,
training: TrainingConfig,
cluster: ClusterConfig,
parallelism: ParallelismConfig,
): ClusterAnalysis {
const totalGPUs = cluster.gpusPerNode * cluster.numNodes
const derivedParallelism = getDerivedParallelism(cluster, parallelism)
const globalBatchSizeTokens =
training.microBatchSize *
training.seqLength *
training.gradAccumSteps *
(derivedParallelism?.dp ?? 0)
const emptyGpuMap = Array.from({ length: totalGPUs }, (_, globalGPUIndex) => ({
globalGPUIndex,
nodeIndex: Math.floor(globalGPUIndex / cluster.gpusPerNode),
localGPUIndex: globalGPUIndex % cluster.gpusPerNode,
tpGroup: -1,
tpLane: -1,
ppStage: -1,
cpShard: -1,
epLane: -1,
dpReplica: -1,
replicaGroup: -1,
fsdpRank: -1,
memoryUsedGB: 0,
memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
isActive: false,
}))
const emptyAnalysis = (): ClusterAnalysis => ({
feasible: false,
infeasibilityReason: 'Invalid configuration',
totalParams: 0,
activeParamsPerToken: 0,
globalBatchSizeTokens,
totalGPUs,
derivedParallelism: {
dp: derivedParallelism?.dp ?? 0,
replicaGroups: derivedParallelism?.replicaGroups ?? 0,
fsdpShardGroupSize: parallelism.fsdpShardGroupSize,
fsdpGroupSize: derivedParallelism?.fsdpGroupSize ?? 0,
ep: parallelism.ep,
},
memoryBreakdown: {
parametersGB: 0,
optimizerStatesGB: 0,
gradientsGB: 0,
activationsGB: 0,
totalGB: 0,
hbmCapacityGB: cluster.gpuType.hbmCapacityGB,
utilizationPercent: 0,
},
pipelineStages: [],
communication: {
tp: {
allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER,
messageSizeBytes: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
},
pp: {
activationMessageSizeBytes: 0,
numP2PTransfersPerStep: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
usesInterNode: false,
},
cp: {
collectivesPerLayer: CP_COLLECTIVES_PER_LAYER,
messageSizeBytes: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
usesInterNode: false,
},
fsdp: {
collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER,
messageSizeBytes: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
usesInterNode: false,
},
ep: {
allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER,
messageSizeBytes: 0,
totalVolumePerStepGB: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
usesInterNode: false,
},
dp: {
gradientVolumePerGPU_GB: 0,
allReduceTimeMs: 0,
canOverlapWithBackward: false,
linkUtilizationPercent: 0,
},
},
throughput: {
computeTimePerStepMs: 0,
communicationTimePerStepMs: 0,
pipelineBubbleFraction: 0,
pipelineBubbleTimeMs: 0,
totalStepTimeMs: 0,
tokensPerSecond: 0,
mfu: 0,
},
gpuMap: emptyGpuMap,
links: [],
})
if (
training.microBatchSize <= 0 ||
training.seqLength <= 0 ||
training.gradAccumSteps <= 0 ||
parallelism.tp <= 0 ||
parallelism.pp <= 0 ||
parallelism.cp <= 0 ||
parallelism.ep <= 0
) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason = 'Batch sizes and parallelism degrees must all be positive.'
return analysis
}
if (parallelism.tp * parallelism.ep > cluster.gpusPerNode) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`TP × EP requires ${parallelism.tp * parallelism.ep} GPUs per node, but nodes only have ${cluster.gpusPerNode}.`
return analysis
}
if (!derivedParallelism) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`World size ${totalGPUs} must be divisible by TP × PP × CP × EP, and the FSDP shard group must divide the cluster cleanly.`
return analysis
}
if (model.hiddenDim % model.numHeads !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`hiddenDim ${model.hiddenDim} must divide evenly across ${model.numHeads} attention heads.`
return analysis
}
if (model.numHeads % parallelism.tp !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`TP ${parallelism.tp} must divide the ${model.numHeads} attention heads.`
return analysis
}
if (model.numKVHeads % parallelism.tp !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`TP ${parallelism.tp} should divide the ${model.numKVHeads} KV heads for clean GQA sharding.`
return analysis
}
if (training.seqLength % parallelism.cp !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`CP ${parallelism.cp} must divide the sequence length ${training.seqLength}.`
return analysis
}
if (model.architecture === 'moe' && !model.moe) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason = 'MoE models require expert metadata.'
return analysis
}
if (model.architecture === 'moe' && model.moe && model.moe.numExperts % parallelism.ep !== 0) {
const analysis = emptyAnalysis()
analysis.infeasibilityReason =
`EP ${parallelism.ep} must divide the ${model.moe.numExperts} experts.`
return analysis
}
const { modelBreakdown, layerDistribution, stageMemory, stageParameters } = getStageMemoryMap(
model,
training,
parallelism,
derivedParallelism,
)
const placement = buildPlacement(cluster, parallelism, derivedParallelism, totalGPUs)
const maxStageLayers = Math.max(...layerDistribution.map((stage) => stage.numLayers), 0)
const pipelineStages = layerDistribution.map((stage) => {
const stageMemoryBreakdown = stageMemory.get(stage.stageIndex)
const stageParameterCount = stageParameters.get(stage.stageIndex)
return {
stageIndex: stage.stageIndex,
layerRange: [stage.startLayer, stage.endLayer] as [number, number],
numLayers: stage.numLayers,
memoryGB: round2(
(stageMemoryBreakdown?.totalGB ?? 0) *
parallelism.tp *
parallelism.cp *
parallelism.ep *
derivedParallelism.dp,
),
hasEmbedding: stageParameterCount?.hasEmbedding ?? false,
hasOutputHead: stageParameterCount?.hasOutputHead ?? false,
}
})
const worstStageIndex = pipelineStages.reduce((worstIndex, stage) => {
const worstStageMemory = stageMemory.get(worstIndex)?.totalGB ?? 0
const candidateStageMemory = stageMemory.get(stage.stageIndex)?.totalGB ?? 0
return candidateStageMemory > worstStageMemory ? stage.stageIndex : worstIndex
}, 0)
const worstStageMemory = stageMemory.get(worstStageIndex) ?? {
parametersGB: 0,
optimizerStatesGB: 0,
gradientsGB: 0,
activationsGB: 0,
totalGB: 0,
}
const pipelineBubbleFraction =
parallelism.pp <= 1
? 0
: (parallelism.pp - 1) / (training.gradAccumSteps + parallelism.pp - 1)
const boundaryStageCount = Math.min(
parallelism.pp,
Math.max(0, Math.round(pipelineBubbleFraction * parallelism.pp)),
)
const gpuMap = placement.map((entry) => {
const stageMemoryBreakdown =
entry.ppStage >= 0
? stageMemory.get(entry.ppStage) ?? {
parametersGB: 0,
optimizerStatesGB: 0,
gradientsGB: 0,
activationsGB: 0,
totalGB: 0,
}
: {
parametersGB: 0,
optimizerStatesGB: 0,
gradientsGB: 0,
activationsGB: 0,
totalGB: 0,
}
const bubbleIdle = entry.ppStage >= parallelism.pp - boundaryStageCount && entry.ppStage >= 0
return {
globalGPUIndex: entry.globalGPUIndex,
nodeIndex: entry.nodeIndex,
localGPUIndex: entry.localGPUIndex,
tpGroup: entry.tpGroup,
tpLane: entry.tpLane,
ppStage: entry.ppStage,
cpShard: entry.cpShard,
epLane: entry.epLane,
dpReplica: entry.dpReplica,
replicaGroup: entry.replicaGroup,
fsdpRank: entry.fsdpRank,
memoryUsedGB: round2(entry.isActive ? stageMemoryBreakdown.totalGB : 0),
memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
isActive: entry.isActive && !bubbleIdle,
}
})
const activationBytes = getActivationBytes(training.precision)
const shardedSequenceLength = training.seqLength / parallelism.cp
const tokensPerMicroBatchShard = training.microBatchSize * shardedSequenceLength
const collectiveMessageBytes =
tokensPerMicroBatchShard * model.hiddenDim * activationBytes
const attentionComputeMultiplier = 0.65 + 0.35 * getAttentionMultiplier(model, training.seqLength)
const activationCheckpointComputeMultiplier = training.activationCheckpointing ? 1.2 : 1
const totalFlopsPerStep =
6 *
modelBreakdown.activeParamsPerToken *
training.microBatchSize *
training.seqLength *
training.gradAccumSteps *
derivedParallelism.dp *
attentionComputeMultiplier *
activationCheckpointComputeMultiplier
const launchedGPUs = Math.max(totalGPUs, 1)
const flopsPerGpuPerStep = totalFlopsPerStep / launchedGPUs
const peakTFLOPs = getPeakTFLOPsForPrecision(cluster.gpuType, training.precision)
const sustainedTFLOPs = peakTFLOPs * getSustainedComputeEfficiency(training)
const computeTimePerStepMs = (flopsPerGpuPerStep / (sustainedTFLOPs * 1e12)) * 1000
const pipelineBubbleTimeMs =
pipelineBubbleFraction >= 1
? 0
: (computeTimePerStepMs * pipelineBubbleFraction) / (1 - pipelineBubbleFraction)
const tentativeTotalStepTimeMs = computeTimePerStepMs + pipelineBubbleTimeMs
const tpMembers = placement.filter(
(entry) =>
entry.dpReplica === 0 &&
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.epLane === 0 &&
entry.tpLane >= 0,
)
const tpStats = getRingCommStats({
groupCount: parallelism.pp * parallelism.cp * parallelism.ep * derivedParallelism.dp,
groupWidth: parallelism.tp,
messageBytes: collectiveMessageBytes,
collectiveCount: TP_ALL_REDUCES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
membersForBandwidth: tpMembers,
cluster,
totalStepTimeMs: tentativeTotalStepTimeMs,
})
const cpMembers = placement.filter(
(entry) =>
entry.dpReplica === 0 &&
entry.ppStage === 0 &&
entry.epLane === 0 &&
entry.tpLane === 0 &&
entry.cpShard >= 0,
)
const cpStats = getRingCommStats({
groupCount: parallelism.pp * derivedParallelism.dp * parallelism.tp * parallelism.ep,
groupWidth: parallelism.cp,
messageBytes: collectiveMessageBytes,
collectiveCount: CP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
membersForBandwidth: cpMembers,
cluster,
totalStepTimeMs: tentativeTotalStepTimeMs,
})
const averageSharedLayerParams =
model.numLayers > 0
? (modelBreakdown.denseLayerCount * modelBreakdown.sharedDenseLayerParams +
modelBreakdown.moeLayerCount * modelBreakdown.sharedMoeLayerParams) /
model.numLayers
: 0
const fsdpMessageBytes =
parallelism.zeroStage >= 3 && derivedParallelism.fsdpDataParallelDegree > 1
? (averageSharedLayerParams / parallelism.tp / derivedParallelism.fsdpDataParallelDegree) *
getParameterBytes(training.precision)
: 0
const fsdpMembers = placement.filter(
(entry) =>
entry.replicaGroup === 0 &&
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.epLane === 0 &&
entry.tpLane === 0,
)
const fsdpStats = getRingCommStats({
groupCount:
derivedParallelism.replicaGroups *
parallelism.pp *
parallelism.cp *
parallelism.ep *
parallelism.tp,
groupWidth: derivedParallelism.fsdpDataParallelDegree,
messageBytes: fsdpMessageBytes,
collectiveCount: FSDP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
membersForBandwidth: fsdpMembers,
cluster,
totalStepTimeMs: tentativeTotalStepTimeMs,
})
const epMembers = placement.filter(
(entry) =>
entry.dpReplica === 0 &&
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.tpLane === 0 &&
entry.epLane >= 0,
)
const moeLayerCount = modelBreakdown.moeLayerCount
const epMessageBytes =
model.architecture === 'moe' && model.moe
? tokensPerMicroBatchShard *
model.hiddenDim *
activationBytes *
model.moe.expertsPerToken
: 0
const epTransferCount = EP_ALL_TO_ALLS_PER_LAYER * moeLayerCount * training.gradAccumSteps
const epStats = (() => {
if (parallelism.ep <= 1 || epTransferCount <= 0 || epMessageBytes <= 0) {
return {
totalVolumeBytes: 0,
timePerStepMs: 0,
linkUtilizationPercent: 0,
usesInterNode: false,
}
}
const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(epMembers, cluster)
const volumeBytesPerGpu = epMessageBytes * epTransferCount * 2
const totalVolumeBytes =
volumeBytesPerGpu *
parallelism.ep *
parallelism.pp *
parallelism.cp *
parallelism.tp *
derivedParallelism.dp
const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
const linkUtilizationPercent =
tentativeTotalStepTimeMs > 0
? clamp(
(bytesToGB(volumeBytesPerGpu) /
(bandwidthGBs * (tentativeTotalStepTimeMs / 1000))) *
100,
0,
100,
)
: 0
return {
totalVolumeBytes,
timePerStepMs,
linkUtilizationPercent,
usesInterNode,
}
})()
let ppTotalVolumeBytes = 0
let ppTimePerStepMs = 0
let ppUsesInterNode = false
for (let dpReplica = 0; dpReplica < derivedParallelism.dp; dpReplica += 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let stageIndex = 0; stageIndex < parallelism.pp - 1; stageIndex += 1) {
const source = getPlacementEntry(placement, {
dpReplica,
ppStage: stageIndex,
cpShard,
epLane: 0,
tpLane: 0,
})
const target = getPlacementEntry(placement, {
dpReplica,
ppStage: stageIndex + 1,
cpShard,
epLane: 0,
tpLane: 0,
})
if (!source || !target) {
continue
}
const usesInterNode = source.nodeIndex !== target.nodeIndex
const bandwidthGBs = usesInterNode
? cluster.interNodeBandwidthGBs
: cluster.intraNodeBandwidthGBs
const perLaneBytes = collectiveMessageBytes / parallelism.tp
ppUsesInterNode ||= usesInterNode
ppTotalVolumeBytes += collectiveMessageBytes * 2 * training.gradAccumSteps
ppTimePerStepMs +=
(bytesToGB(perLaneBytes) / bandwidthGBs) * 1000 * 2 * training.gradAccumSteps
}
}
}
const maxStageGradientBytes = Math.max(
...Array.from(stageMemory.values()).map((stage) => stage.gradientsGB * BYTES_PER_GB),
0,
)
const dpGroupWidth =
parallelism.fsdpShardGroupSize > 1
? derivedParallelism.replicaGroups
: derivedParallelism.dp
const dpMembers = parallelism.fsdpShardGroupSize > 1
? placement.filter(
(entry) =>
entry.fsdpRank === 0 &&
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.epLane === 0 &&
entry.tpLane === 0,
)
: placement.filter(
(entry) =>
entry.ppStage === 0 &&
entry.cpShard === 0 &&
entry.epLane === 0 &&
entry.tpLane === 0,
)
const gradientCommBytesPerGpu =
dpGroupWidth > 1
? (2 * (dpGroupWidth - 1) * maxStageGradientBytes) / dpGroupWidth
: 0
const dpBandwidth = getMaxBandwidthForCollective(dpMembers, cluster)
const dpTimeMs =
dpGroupWidth > 1
? (bytesToGB(gradientCommBytesPerGpu) / dpBandwidth.bandwidthGBs) * 1000
: 0
const canOverlapDp = dpGroupWidth > 1 && (parallelism.pp > 1 || training.gradAccumSteps > 1)
const dpNonOverlappedTimeMs = dpTimeMs * (canOverlapDp ? 0.35 : 1)
const communicationTimePerStepMs =
tpStats.timePerStepMs +
cpStats.timePerStepMs +
fsdpStats.timePerStepMs +
epStats.timePerStepMs +
ppTimePerStepMs +
dpNonOverlappedTimeMs
const totalStepTimeMs =
computeTimePerStepMs + pipelineBubbleTimeMs + communicationTimePerStepMs
const tokensPerSecond =
totalStepTimeMs > 0 ? globalBatchSizeTokens / (totalStepTimeMs / 1000) : 0
const mfu =
tokensPerSecond > 0
? clamp(
(6 * modelBreakdown.activeParamsPerToken * attentionComputeMultiplier * tokensPerSecond) /
(launchedGPUs * peakTFLOPs * 1e12),
0,
1,
)
: 0
const dpLinkUtilizationPercent =
dpGroupWidth > 1 && totalStepTimeMs > 0
? clamp(
(bytesToGB(gradientCommBytesPerGpu) /
(dpBandwidth.bandwidthGBs * (totalStepTimeMs / 1000))) *
100,
0,
100,
)
: 0
const ppPerLaneVolumeGB =
parallelism.pp > 1
? bytesToGB(collectiveMessageBytes / parallelism.tp) * 2 * training.gradAccumSteps
: 0
const ppLinkUtilizationPercent =
parallelism.pp > 1 && totalStepTimeMs > 0
? clamp(
(ppPerLaneVolumeGB /
((ppUsesInterNode
? cluster.interNodeBandwidthGBs
: cluster.intraNodeBandwidthGBs) *
(totalStepTimeMs / 1000))) *
100,
0,
100,
)
: 0
const links: ClusterAnalysis['links'] = []
const visualReplicaSamples = Math.min(derivedParallelism.dp, 12)
const sampledDpReplicas = Array.from({ length: visualReplicaSamples }, (_, sampleIndex) =>
Math.floor((sampleIndex * derivedParallelism.dp) / visualReplicaSamples),
)
for (const dpReplica of sampledDpReplicas) {
for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
const tpEntries = placement
.filter(
(entry) =>
entry.dpReplica === dpReplica &&
entry.ppStage === ppStage &&
entry.cpShard === cpShard &&
entry.epLane === epLane,
)
.sort((left, right) => left.tpLane - right.tpLane)
if (parallelism.tp > 1) {
for (let lane = 0; lane < tpEntries.length; lane += 1) {
const from = tpEntries[lane]
const to = tpEntries[(lane + 1) % tpEntries.length]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: 'nvlink',
trafficType: 'tp',
volumeGB: round2(bytesToGB(tpStats.volumeBytesPerGpu)),
utilizationPercent: round2(tpStats.linkUtilizationPercent),
})
}
}
if (ppStage < parallelism.pp - 1) {
const nextTpEntries = placement
.filter(
(entry) =>
entry.dpReplica === dpReplica &&
entry.ppStage === ppStage + 1 &&
entry.cpShard === cpShard &&
entry.epLane === epLane,
)
.sort((left, right) => left.tpLane - right.tpLane)
for (let lane = 0; lane < Math.min(tpEntries.length, nextTpEntries.length); lane += 1) {
const from = tpEntries[lane]
const to = nextTpEntries[lane]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'pp',
volumeGB: round2(ppPerLaneVolumeGB),
utilizationPercent: round2(ppLinkUtilizationPercent),
})
}
}
}
}
if (parallelism.cp > 1) {
for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
const cpEntries = placement
.filter(
(entry) =>
entry.dpReplica === dpReplica &&
entry.ppStage === ppStage &&
entry.epLane === epLane &&
entry.tpLane === tpLane,
)
.sort((left, right) => left.cpShard - right.cpShard)
for (let shardIndex = 0; shardIndex < cpEntries.length; shardIndex += 1) {
const from = cpEntries[shardIndex]
const to = cpEntries[(shardIndex + 1) % cpEntries.length]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'cp',
volumeGB: round2(bytesToGB(cpStats.volumeBytesPerGpu)),
utilizationPercent: round2(cpStats.linkUtilizationPercent),
})
}
}
}
}
if (parallelism.ep > 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
const epEntries = placement
.filter(
(entry) =>
entry.dpReplica === dpReplica &&
entry.ppStage === ppStage &&
entry.cpShard === cpShard &&
entry.tpLane === tpLane,
)
.sort((left, right) => left.epLane - right.epLane)
for (let lane = 0; lane < epEntries.length; lane += 1) {
const from = epEntries[lane]
const to = epEntries[(lane + 1) % epEntries.length]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'ep',
volumeGB: round2(
epStats.totalVolumeBytes > 0
? bytesToGB(epStats.totalVolumeBytes) /
(parallelism.ep *
Math.max(parallelism.tp * parallelism.cp * parallelism.pp * derivedParallelism.dp, 1))
: 0,
),
utilizationPercent: round2(epStats.linkUtilizationPercent),
})
}
}
}
}
if (derivedParallelism.fsdpDataParallelDegree > 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
const fsdpEntries = placement
.filter(
(entry) =>
entry.replicaGroup === placement.find((item) => item.dpReplica === dpReplica)?.replicaGroup &&
entry.ppStage === ppStage &&
entry.cpShard === cpShard &&
entry.epLane === epLane &&
entry.tpLane === tpLane,
)
.sort((left, right) => left.fsdpRank - right.fsdpRank)
for (let rank = 0; rank < fsdpEntries.length; rank += 1) {
const from = fsdpEntries[rank]
const to = fsdpEntries[(rank + 1) % fsdpEntries.length]
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'fsdp',
volumeGB: round2(bytesToGB(fsdpStats.volumeBytesPerGpu)),
utilizationPercent: round2(fsdpStats.linkUtilizationPercent),
})
}
}
}
}
}
if (dpGroupWidth > 1) {
for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
const current = placement.find((entry) => entry.dpReplica === dpReplica)
if (!current) {
continue
}
const from = getPlacementEntry(placement, {
replicaGroup:
parallelism.fsdpShardGroupSize > 1 ? current.replicaGroup : undefined,
fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined,
dpReplica: parallelism.fsdpShardGroupSize > 1 ? undefined : dpReplica,
ppStage,
cpShard,
epLane,
tpLane,
})
const to = getPlacementEntry(placement, {
replicaGroup:
parallelism.fsdpShardGroupSize > 1
? (current.replicaGroup + 1) % derivedParallelism.replicaGroups
: undefined,
fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined,
dpReplica:
parallelism.fsdpShardGroupSize > 1
? undefined
: (dpReplica + 1) % derivedParallelism.dp,
ppStage,
cpShard,
epLane,
tpLane,
})
if (!from || !to) {
continue
}
links.push({
fromGPU: from.globalGPUIndex,
toGPU: to.globalGPUIndex,
type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband',
trafficType: 'dp',
volumeGB: round2(bytesToGB(gradientCommBytesPerGpu)),
utilizationPercent: round2(dpLinkUtilizationPercent),
})
}
}
}
}
}
}
const feasible = worstStageMemory.totalGB <= cluster.gpuType.hbmCapacityGB
const infeasibilityReason = feasible
? undefined
: `Stage ${worstStageIndex} uses ${round2(worstStageMemory.totalGB)} GB per GPU, exceeding ${cluster.gpuType.hbmCapacityGB} GB of HBM.`
return {
feasible,
infeasibilityReason,
totalParams: Math.round(modelBreakdown.totalParams),
activeParamsPerToken: Math.round(modelBreakdown.activeParamsPerToken),
globalBatchSizeTokens,
totalGPUs,
derivedParallelism: {
dp: derivedParallelism.dp,
replicaGroups: derivedParallelism.replicaGroups,
fsdpShardGroupSize: parallelism.fsdpShardGroupSize,
fsdpGroupSize: derivedParallelism.fsdpGroupSize,
ep: parallelism.ep,
},
memoryBreakdown: {
parametersGB: round2(worstStageMemory.parametersGB),
optimizerStatesGB: round2(worstStageMemory.optimizerStatesGB),
gradientsGB: round2(worstStageMemory.gradientsGB),
activationsGB: round2(worstStageMemory.activationsGB),
totalGB: round2(worstStageMemory.totalGB),
hbmCapacityGB: cluster.gpuType.hbmCapacityGB,
utilizationPercent: round2(
(worstStageMemory.totalGB / cluster.gpuType.hbmCapacityGB) * 100,
),
},
pipelineStages,
communication: {
tp: {
allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER,
messageSizeBytes: collectiveMessageBytes,
totalVolumePerStepGB: round2(bytesToGB(tpStats.totalVolumeBytes)),
timePerStepMs: round2(tpStats.timePerStepMs),
linkUtilizationPercent: round2(tpStats.linkUtilizationPercent),
},
pp: {
activationMessageSizeBytes: collectiveMessageBytes,
numP2PTransfersPerStep:
parallelism.pp > 1
? 2 *
(parallelism.pp - 1) *
training.gradAccumSteps *
parallelism.cp *
parallelism.tp *
derivedParallelism.dp
: 0,
totalVolumePerStepGB: round2(bytesToGB(ppTotalVolumeBytes)),
timePerStepMs: round2(ppTimePerStepMs),
usesInterNode: ppUsesInterNode,
},
cp: {
collectivesPerLayer: CP_COLLECTIVES_PER_LAYER,
messageSizeBytes: collectiveMessageBytes,
totalVolumePerStepGB: round2(bytesToGB(cpStats.totalVolumeBytes)),
timePerStepMs: round2(cpStats.timePerStepMs),
linkUtilizationPercent: round2(cpStats.linkUtilizationPercent),
usesInterNode: cpStats.usesInterNode,
},
fsdp: {
collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER,
messageSizeBytes: round2(fsdpMessageBytes),
totalVolumePerStepGB: round2(bytesToGB(fsdpStats.totalVolumeBytes)),
timePerStepMs: round2(fsdpStats.timePerStepMs),
linkUtilizationPercent: round2(fsdpStats.linkUtilizationPercent),
usesInterNode: fsdpStats.usesInterNode,
},
ep: {
allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER,
messageSizeBytes: round2(epMessageBytes),
totalVolumePerStepGB: round2(bytesToGB(epStats.totalVolumeBytes)),
timePerStepMs: round2(epStats.timePerStepMs),
linkUtilizationPercent: round2(epStats.linkUtilizationPercent),
usesInterNode: epStats.usesInterNode,
},
dp: {
gradientVolumePerGPU_GB: round2(bytesToGB(gradientCommBytesPerGpu)),
allReduceTimeMs: round2(dpTimeMs),
canOverlapWithBackward: canOverlapDp,
linkUtilizationPercent: round2(dpLinkUtilizationPercent),
},
},
throughput: {
computeTimePerStepMs: round2(computeTimePerStepMs),
communicationTimePerStepMs: round2(communicationTimePerStepMs),
pipelineBubbleFraction: round2(pipelineBubbleFraction),
pipelineBubbleTimeMs: round2(pipelineBubbleTimeMs),
totalStepTimeMs: round2(totalStepTimeMs),
tokensPerSecond: round2(tokensPerSecond),
mfu: round2(mfu),
},
gpuMap,
links,
}
}
/**
 * Preset: LLaMA-7B-class dense transformer.
 *
 * Multi-head attention with equal query and KV head counts (no GQA) and
 * full (non-windowed) attention on every layer.
 */
export const llama7B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 4096,
    numLayers: 32,
    numHeads: 32,
    numKVHeads: 32,
    vocabSize: 32000,
    intermediateSize: 11008,
    tiedEmbeddings: false,
    attentionProfile: { type: 'full' },
  }
  return config
}
/**
 * Preset: LLaMA-70B-class dense transformer.
 *
 * 64 query heads over 8 KV heads (grouped KV) with full attention on
 * every layer.
 */
export const llama70B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 8192,
    numLayers: 80,
    numHeads: 64,
    numKVHeads: 8,
    vocabSize: 32000,
    intermediateSize: 28672,
    tiedEmbeddings: false,
    attentionProfile: { type: 'full' },
  }
  return config
}
export const llama405B = (): ModelConfig => ({
architecture: 'dense',
hiddenDim: 16384,
numLayers: 126,
numHeads: 128,
numKVHeads: 8,
vocabSize: 128256,
intermediateSize: 53248,
tiedEmbeddings: false,
attentionProfile: {
type: 'full',
},
})
/**
 * Preset: OLMo-3 32B dense transformer.
 *
 * Uses a hybrid attention profile: a 4096-token sliding window with a
 * globalAttentionFraction of 0.25 (how that fraction maps onto layers is
 * decided by the attention-profile consumer elsewhere in this module).
 */
export const olmo3_32B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 5120,
    numLayers: 64,
    numHeads: 40,
    numKVHeads: 8,
    vocabSize: 100278,
    intermediateSize: 27648,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'hybrid',
      slidingWindowSize: 4096,
      globalAttentionFraction: 0.25,
    },
  }
  return config
}
/**
 * Preset: Llama-3.1-405B-class dense transformer.
 *
 * 128 query heads over 8 KV heads (grouped KV), large 128k-entry vocab,
 * full attention on every layer.
 */
export const llama31_405B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'dense',
    hiddenDim: 16384,
    numLayers: 126,
    numHeads: 128,
    numKVHeads: 8,
    vocabSize: 128256,
    intermediateSize: 53248,
    tiedEmbeddings: false,
    attentionProfile: { type: 'full' },
  }
  return config
}
/**
 * Preset: "Trinity Large" 400B-class mixture-of-experts transformer.
 *
 * 256 experts with 4 routed per token, the first 6 layers dense, and a
 * hybrid attention profile (4096-token sliding window, full attention on
 * every 4th layer). activeParamsPerToken is pinned explicitly to 13B
 * rather than derived from the other fields.
 */
export const trinityLarge400B = (): ModelConfig => {
  const config: ModelConfig = {
    architecture: 'moe',
    hiddenDim: 3072,
    numLayers: 60,
    numHeads: 48,
    numKVHeads: 8,
    vocabSize: 200192,
    intermediateSize: 12288,
    tiedEmbeddings: false,
    attentionProfile: {
      type: 'hybrid',
      slidingWindowSize: 4096,
      globalAttentionEveryN: 4,
    },
    moe: {
      numExperts: 256,
      expertsPerToken: 4,
      numDenseLayers: 6,
      expertIntermediateSize: 3072,
      activeParamsPerToken: 13_000_000_000,
    },
  }
  return config
}
/** GPU spec: NVIDIA A100 80GB (312 TFLOPs BF16, 2 TB/s HBM). */
export const a100_80gb = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'A100 80GB',
    hbmCapacityGB: 80,
    peakTFLOPsBF16: 312,
    memBandwidthTBs: 2,
  }
  return spec
}
/** GPU spec: NVIDIA H100 SXM (989 TFLOPs BF16, 3.35 TB/s HBM3). */
export const h100_sxm = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'H100 SXM',
    hbmCapacityGB: 80,
    peakTFLOPsBF16: 989,
    memBandwidthTBs: 3.35,
  }
  return spec
}
/**
 * GPU spec: NVIDIA B300.
 *
 * NOTE(review): public B300 (Blackwell Ultra) material quotes 288 GB of
 * HBM3e; the 192 GB here may be a deliberate simplification to match
 * gb200 — confirm before relying on capacity-bound results.
 */
export const b300 = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'B300',
    hbmCapacityGB: 192,
    peakTFLOPsBF16: 2250,
    memBandwidthTBs: 8,
  }
  return spec
}
/** GPU spec: NVIDIA GB200 (per-GPU figures; 2250 TFLOPs BF16, 8 TB/s HBM3e). */
export const gb200 = (): GPUSpec => {
  const spec: GPUSpec = {
    name: 'GB200',
    hbmCapacityGB: 192,
    peakTFLOPsBF16: 2250,
    memBandwidthTBs: 8,
  }
  return spec
}
/**
 * Cluster preset: a single 8-GPU host (defaults to A100 80GB).
 *
 * Fabric bandwidths come from getDefaultFabric for the chosen GPU type.
 * With only one node, all display labels collapse to node-level scope.
 */
export const singleNode8GPU = (gpuType: GPUSpec = a100_80gb()): ClusterConfig => {
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)
  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 1,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 1,
    rackLabel: 'node',
    nodeLabel: 'GPU host',
    podLabel: 'node',
  }
}
/**
 * Cluster preset: 64 GPUs as 8 nodes x 8 GPUs, 4 nodes per rack
 * (defaults to H100 SXM).
 *
 * Fabric bandwidths come from getDefaultFabric for the chosen GPU type.
 */
export const cluster64GPU = (gpuType: GPUSpec = h100_sxm()): ClusterConfig => {
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)
  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 8,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 4,
    rackLabel: 'rack',
    nodeLabel: 'GPU host',
    podLabel: 'rack',
  }
}
/**
 * Cluster preset: 576 GB200 GPUs laid out as 72 compute trays of 8 GPUs,
 * 9 trays per NVL72-style rack. GPU type is fixed to gb200; fabric
 * bandwidths come from getDefaultFabric.
 */
export const frontier576GPU = (): ClusterConfig => {
  const gpuType = gb200()
  const { intraNodeBandwidthGBs, interNodeBandwidthGBs } = getDefaultFabric(gpuType)
  return {
    gpuType,
    gpusPerNode: 8,
    numNodes: 72,
    intraNodeBandwidthGBs,
    interNodeBandwidthGBs,
    nodesPerRack: 9,
    rackLabel: 'NVL72 rack',
    nodeLabel: 'compute tray',
    podLabel: 'rack',
  }
}