/**
 * Model architecture description consumed by the cluster analyzer.
 * All counts here are raw (unsharded) sizes.
 */
export interface ModelConfig {
  architecture: 'dense' | 'moe'
  hiddenDim: number
  numLayers: number
  numHeads: number
  // Grouped-query attention: number of KV heads (equals numHeads for plain MHA).
  numKVHeads: number
  vocabSize: number
  intermediateSize: number
  // When true, the output head reuses the embedding matrix (no extra params).
  tiedEmbeddings: boolean
  // Optional hybrid attention profile (sliding-window + periodic global layers).
  attentionProfile?: {
    type: 'full' | 'hybrid'
    slidingWindowSize?: number
    globalAttentionFraction?: number
    globalAttentionEveryN?: number
  }
  // Present only when architecture === 'moe'.
  moe?: {
    numExperts: number
    expertsPerToken: number
    // Leading layers that stay dense before MoE layers begin.
    numDenseLayers: number
    expertIntermediateSize: number
    // Optional override; otherwise active params are derived from the config.
    activeParamsPerToken?: number
  }
}

/** Per-step training hyperparameters that affect memory and throughput. */
export interface TrainingConfig {
  microBatchSize: number
  seqLength: number
  gradAccumSteps: number
  precision: 'fp32' | 'bf16' | 'fp16' | 'fp8'
  activationCheckpointing: boolean
  optimizer: 'adam' | 'adamw' | 'sgd' | 'muon'
}

/** Static hardware spec for one GPU SKU. */
export interface GPUSpec {
  name: string
  hbmCapacityGB: number
  peakTFLOPsBF16: number
  memBandwidthTBs: number
}

/** Physical cluster topology and link bandwidths (GB/s). */
export interface ClusterConfig {
  gpuType: GPUSpec
  gpusPerNode: number
  numNodes: number
  intraNodeBandwidthGBs: number
  interNodeBandwidthGBs: number
  nodesPerRack?: number
  rackLabel?: string
  nodeLabel?: string
  podLabel?: string
}

/** User-selected parallelism degrees (TP/PP/CP/EP) and sharding options. */
export interface ParallelismConfig {
  tp: number
  pp: number
  cp: number
  ep: number
  distributedOptimizer: boolean
  fsdpShardGroupSize: number
  zeroStage: 0 | 1 | 2 | 3
}

/**
 * Full analyzer output: feasibility, memory, per-family communication costs,
 * throughput estimates, and a per-GPU placement map for visualization.
 */
export interface ClusterAnalysis {
  feasible: boolean
  infeasibilityReason?: string
  totalParams: number
  activeParamsPerToken: number
  globalBatchSizeTokens: number
  totalGPUs: number
  derivedParallelism: {
    dp: number
    replicaGroups: number
    fsdpShardGroupSize: number
    fsdpGroupSize: number
    ep: number
  }
  // Per-GPU memory picture for the worst (most loaded) pipeline stage.
  memoryBreakdown: {
    parametersGB: number
    optimizerStatesGB: number
    gradientsGB: number
    activationsGB: number
    totalGB: number
    hbmCapacityGB: number
    utilizationPercent: number
  }
  pipelineStages: {
    stageIndex: number
    layerRange: [number, number]
    numLayers: number
    memoryGB: number
    hasEmbedding: boolean
    hasOutputHead: boolean
  }[]
  // One entry per parallelism family; volumes are per training step.
  communication: {
    tp: {
      allReducesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
    }
    pp: {
      activationMessageSizeBytes: number
      numP2PTransfersPerStep: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      usesInterNode: boolean
    }
    cp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    fsdp: {
      collectivesPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    ep: {
      allToAllsPerLayer: number
      messageSizeBytes: number
      totalVolumePerStepGB: number
      timePerStepMs: number
      linkUtilizationPercent: number
      usesInterNode: boolean
    }
    dp: {
      gradientVolumePerGPU_GB: number
      allReduceTimeMs: number
      canOverlapWithBackward: boolean
      linkUtilizationPercent: number
    }
  }
  throughput: {
    computeTimePerStepMs: number
    communicationTimePerStepMs: number
    pipelineBubbleFraction: number
    pipelineBubbleTimeMs: number
    totalStepTimeMs: number
    tokensPerSecond: number
    mfu: number
  }
  // One entry per physical GPU; -1 coordinates mean "unassigned/idle".
  gpuMap: {
    globalGPUIndex: number
    nodeIndex: number
    localGPUIndex: number
    tpGroup: number
    tpLane: number
    ppStage: number
    cpShard: number
    epLane: number
    dpReplica: number
    replicaGroup: number
    fsdpRank: number
    memoryUsedGB: number
    memoryCapacityGB: number
    isActive: boolean
  }[]
  links: {
    fromGPU: number
    toGPU: number
    type: 'nvlink' | 'infiniband'
    trafficType: 'tp' | 'pp' | 'cp' | 'fsdp' | 'ep' | 'dp'
    volumeGB: number
    utilizationPercent: number
  }[]
}

/** Contiguous layer span assigned to one pipeline stage. */
type LayerDistribution = {
  stageIndex: number
  startLayer: number
  endLayer: number
  numLayers: number
}

/** Per-GPU memory components for one stage, in decimal GB. */
type StageMemory = {
  parametersGB: number
  optimizerStatesGB: number
  gradientsGB: number
  activationsGB: number
  totalGB: number
}

/** Unsharded parameter counts owned by one pipeline stage. */
type StageParameterCount = {
  stageParams: number
  sharedParams: number
  expertParams: number
  denseLayers: number
  moeLayers: number
  hasEmbedding: boolean
  hasOutputHead: boolean
}

/** Coordinates of one physical GPU across every parallelism axis. */
type PlacementEntry = {
  globalGPUIndex: number
  nodeIndex: number
  localGPUIndex: number
  tpGroup: number
  tpLane: number
  ppStage: number
  cpShard: number
  epLane: number
  dpReplica:
number replicaGroup: number fsdpRank: number isActive: boolean } type DerivedParallelism = { modelParallelSize: number dp: number replicaGroups: number fsdpGroupSize: number fsdpDataParallelDegree: number } type ModelBreakdown = ReturnType type RingCommStats = { volumeBytesPerGpu: number totalVolumeBytes: number timePerStepMs: number linkUtilizationPercent: number usesInterNode: boolean } const BYTES_PER_GB = 1e9 const TP_ALL_REDUCES_PER_LAYER = 4 const CP_COLLECTIVES_PER_LAYER = 2 const FSDP_COLLECTIVES_PER_LAYER = 4 const EP_ALL_TO_ALLS_PER_LAYER = 2 const DEFAULT_BF16_EFFICIENCY = 0.56 const clamp = (value: number, min: number, max: number) => Math.min(Math.max(value, min), max) const bytesToGB = (bytes: number) => bytes / BYTES_PER_GB const round2 = (value: number) => Math.round(value * 100) / 100 const getParameterBytes = (precision: TrainingConfig['precision']) => { switch (precision) { case 'fp32': return 4 case 'fp8': return 1 default: return 2 } } const getActivationBytes = (precision: TrainingConfig['precision']) => precision === 'fp32' ? 4 : 2 const getGradientBytes = (precision: TrainingConfig['precision']) => precision === 'fp32' ? 4 : 2 const getOptimizerBytesPerParam = ( optimizer: TrainingConfig['optimizer'], precision: TrainingConfig['precision'], ) => { if (optimizer === 'sgd') { return 4 } // Muon keeps lower optimizer state than Adam-family optimizers in practice. // We model it as 8 bytes per parameter of extra state on top of bf16 weights. if (optimizer === 'muon') { return 8 } return precision === 'fp32' ? 8 : 12 } const getPeakTFLOPsForPrecision = (gpu: GPUSpec, precision: TrainingConfig['precision']) => { switch (precision) { case 'fp32': return gpu.peakTFLOPsBF16 * 0.25 case 'fp8': return gpu.peakTFLOPsBF16 * 2 default: return gpu.peakTFLOPsBF16 } } const getSustainedComputeEfficiency = (training: TrainingConfig) => { const checkpointPenalty = training.activationCheckpointing ? 0.02 : 0 const fp32Penalty = training.precision === 'fp32' ? 
0.08 : 0 const moeBoost = training.optimizer === 'muon' ? 0.02 : 0 return clamp(DEFAULT_BF16_EFFICIENCY - checkpointPenalty - fp32Penalty + moeBoost, 0.3, 0.62) } const distributeLayers = (numLayers: number, pp: number): LayerDistribution[] => { const baseLayers = Math.floor(numLayers / pp) const remainder = numLayers % pp let startLayer = 0 return Array.from({ length: pp }, (_, stageIndex) => { const stageLayers = baseLayers + (stageIndex < remainder ? 1 : 0) const endLayer = startLayer + stageLayers - 1 const distribution = { stageIndex, startLayer, endLayer, numLayers: stageLayers, } startLayer += stageLayers return distribution }) } const getDefaultFabric = (gpu: GPUSpec) => { const normalizedName = gpu.name.toLowerCase() if (normalizedName.includes('gb200')) { return { intraNodeBandwidthGBs: 900, interNodeBandwidthGBs: 100, } } if (normalizedName.includes('h100')) { return { intraNodeBandwidthGBs: 450, interNodeBandwidthGBs: 100, } } return { intraNodeBandwidthGBs: 300, interNodeBandwidthGBs: 50, } } const getModelBreakdown = (model: ModelConfig) => { const headDim = model.hiddenDim / model.numHeads const embeddingParams = model.vocabSize * model.hiddenDim const kvProjectionDim = model.numKVHeads * headDim const perLayerAttentionParams = model.hiddenDim * (model.hiddenDim + 2 * kvProjectionDim + model.hiddenDim) const perLayerDenseMlpParams = model.hiddenDim * model.intermediateSize * 3 const perLayerNormParams = model.hiddenDim * 2 const finalNormParams = model.hiddenDim const outputHeadParams = model.tiedEmbeddings ? 0 : embeddingParams const perExpertParams = model.architecture === 'moe' && model.moe ? model.hiddenDim * model.moe.expertIntermediateSize * 3 : 0 const totalExpertParamsPerLayer = model.architecture === 'moe' && model.moe ? perExpertParams * model.moe.numExperts : 0 const denseLayerCount = model.architecture === 'moe' && model.moe ? 
      model.moe.numDenseLayers : model.numLayers
  const moeLayerCount = model.numLayers - denseLayerCount
  const sharedDenseLayerParams = perLayerAttentionParams + perLayerDenseMlpParams + perLayerNormParams
  // MoE layers share attention + norms; their MLP params live in the experts.
  const sharedMoeLayerParams = perLayerAttentionParams + perLayerNormParams
  const sharedParams = embeddingParams + denseLayerCount * sharedDenseLayerParams + moeLayerCount * sharedMoeLayerParams + finalNormParams + outputHeadParams
  const totalParams = sharedParams + moeLayerCount * totalExpertParamsPerLayer
  // Active params per token: shared params plus only the routed top-k experts.
  const derivedActiveParams = model.architecture === 'moe' && model.moe ? embeddingParams + denseLayerCount * sharedDenseLayerParams + moeLayerCount * (sharedMoeLayerParams + model.moe.expertsPerToken * perExpertParams) + finalNormParams + outputHeadParams : totalParams
  // An explicit override in the config wins over the derived estimate.
  const activeParamsPerToken = model.architecture === 'moe' && model.moe?.activeParamsPerToken != null ? model.moe.activeParamsPerToken : derivedActiveParams
  const perLayerTotalParams = model.architecture === 'moe' ? sharedMoeLayerParams + totalExpertParamsPerLayer : sharedDenseLayerParams
  return {
    headDim,
    kvProjectionDim,
    embeddingParams,
    perLayerAttentionParams,
    perLayerDenseMlpParams,
    perLayerNormParams,
    perExpertParams,
    totalExpertParamsPerLayer,
    sharedDenseLayerParams,
    sharedMoeLayerParams,
    denseLayerCount,
    moeLayerCount,
    sharedParams,
    perLayerTotalParams,
    finalNormParams,
    outputHeadParams,
    totalParams,
    activeParamsPerToken,
  }
}

/** Micro-batches in flight at once: bounded by grad-accum steps and PP depth. */
const getConcurrentMicroBatches = (
  training: TrainingConfig,
  parallelism: ParallelismConfig,
) => {
  if (parallelism.pp <= 1) {
    return 1
  }
  return Math.max(1, Math.min(training.gradAccumSteps, parallelism.pp))
}

/**
 * Effective attention cost multiplier in (0, 1] for hybrid attention: blends
 * the sliding-window fraction with the full-attention (global) fraction.
 */
const getAttentionMultiplier = (model: ModelConfig, seqLength: number) => {
  const profile = model.attentionProfile
  if (!profile || profile.type === 'full') {
    return 1
  }
  const windowMultiplier = profile.slidingWindowSize != null ? clamp(profile.slidingWindowSize / seqLength, 0, 1) : 1
  // Default global fraction: every-Nth-layer if given, else assume one quarter.
  const globalFraction = profile.globalAttentionFraction ??
    (profile.globalAttentionEveryN != null ? 1 / profile.globalAttentionEveryN : 0.25)
  return clamp(globalFraction + (1 - globalFraction) * windowMultiplier, windowMultiplier, 1)
}

/** Dense vs MoE layer counts inside a stage (dense layers come first). */
const getStageLayerMix = (stage: LayerDistribution, model: ModelConfig) => {
  if (model.architecture !== 'moe' || !model.moe) {
    return {
      denseLayers: stage.numLayers,
      moeLayers: 0,
    }
  }
  const denseEnd = model.moe.numDenseLayers - 1
  const denseLayers = denseEnd < stage.startLayer ? 0 : Math.max(0, Math.min(stage.endLayer, denseEnd) - stage.startLayer + 1)
  return {
    denseLayers,
    moeLayers: stage.numLayers - denseLayers,
  }
}

/** Unsharded parameter counts owned by one pipeline stage. */
const getStageParameterCount = (
  stage: LayerDistribution,
  modelBreakdown: ModelBreakdown,
  parallelism: ParallelismConfig,
  model: ModelConfig,
): StageParameterCount => {
  const layerMix = getStageLayerMix(stage, model)
  let sharedParams = layerMix.denseLayers * modelBreakdown.sharedDenseLayerParams + layerMix.moeLayers * modelBreakdown.sharedMoeLayerParams
  const expertParams = layerMix.moeLayers * modelBreakdown.totalExpertParamsPerLayer
  // First stage owns the embedding; last stage owns final norm + output head.
  const hasEmbedding = stage.stageIndex === 0
  const hasOutputHead = stage.stageIndex === parallelism.pp - 1
  if (hasEmbedding) {
    sharedParams += modelBreakdown.embeddingParams
  }
  if (hasOutputHead) {
    sharedParams += modelBreakdown.finalNormParams + modelBreakdown.outputHeadParams
  }
  return {
    stageParams: sharedParams + expertParams,
    sharedParams,
    expertParams,
    denseLayers: layerMix.denseLayers,
    moeLayers: layerMix.moeLayers,
    hasEmbedding,
    hasOutputHead,
  }
}

/** Per-layer activation bytes on one GPU after TP/CP/EP sharding. */
const getActivationMemoryBytesPerLayer = ({
  model,
  training,
  parallelism,
  isMoeLayer,
}: {
  model: ModelConfig
  training: TrainingConfig
  parallelism: ParallelismConfig
  isMoeLayer: boolean
}) => {
  const activationBytes = getActivationBytes(training.precision)
  const shardedSequenceLength = training.seqLength / parallelism.cp
  const tokensPerShard = training.microBatchSize * shardedSequenceLength
  const kvHiddenDim = model.numKVHeads * (model.hiddenDim / model.numHeads)
  const tpSequenceShardFactor =
    parallelism.tp > 1 ? parallelism.tp : 1
  // Sequence parallelism shards the residual stream and checkpointed layer boundaries across
  // the TP group. We assume TP-enabled dense training uses this Megatron-style optimization.
  const hiddenStateBytes = (tokensPerShard * model.hiddenDim * activationBytes) / tpSequenceShardFactor
  const attentionMultiplier = getAttentionMultiplier(model, training.seqLength)
  // Sequence-parallel CP reduces the activation footprint by the number of sequence shards.
  const qkvBytes = tokensPerShard * (model.hiddenDim + 2 * kvHiddenDim) * activationBytes * attentionMultiplier
  const denseMlpBytes = tokensPerShard * model.intermediateSize * activationBytes * 2
  const moeMlpBytes = isMoeLayer && model.moe ? (tokensPerShard * model.moe.expertIntermediateSize * activationBytes * model.moe.expertsPerToken * 2) / Math.max(parallelism.ep, 1) : 0
  const shardedIntermediateBytes = (qkvBytes + (isMoeLayer ? moeMlpBytes : denseMlpBytes)) / Math.max(parallelism.tp, 1)
  if (training.activationCheckpointing) {
    // Checkpointing keeps only layer-boundary states plus a small working set.
    return hiddenStateBytes * 2 + shardedIntermediateBytes * 0.25
  }
  return hiddenStateBytes * 6 + shardedIntermediateBytes * 2
}

/**
 * Per-GPU memory footprint (params / optimizer / grads / activations) for one
 * pipeline stage, applying TP/EP sharding and ZeRO/FSDP shard factors.
 */
const getStageMemory = (
  stageParams: StageParameterCount,
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const parameterBytes = getParameterBytes(training.precision)
  const gradientBytes = getGradientBytes(training.precision)
  const optimizerBytes = getOptimizerBytesPerParam(training.optimizer, training.precision)
  const fsdpShardFactor = parallelism.fsdpShardGroupSize > 1 ? derivedParallelism.fsdpDataParallelDegree : 1
  const distributedShardFactor = parallelism.distributedOptimizer ? derivedParallelism.dp : 1
  // ZeRO-3 shards parameters; ZeRO-1+ shards optimizer; ZeRO-2+ shards grads.
  // With an explicit FSDP shard group, that group's degree is used instead of
  // the full DP degree.
  const parameterShardFactor = parallelism.zeroStage >= 3 ? fsdpShardFactor : 1
  const optimizerShardFactor = parallelism.zeroStage >= 1 ? parallelism.fsdpShardGroupSize > 1 ?
      fsdpShardFactor : distributedShardFactor : 1
  const gradientShardFactor = parallelism.zeroStage >= 2 ? parallelism.fsdpShardGroupSize > 1 ? fsdpShardFactor : derivedParallelism.dp : 1
  // TP shards shared params; experts are additionally split across EP lanes.
  const sharedParamsLocal = stageParams.sharedParams / Math.max(parallelism.tp, 1)
  const expertParamsLocal = stageParams.expertParams / Math.max(parallelism.tp * parallelism.ep, 1)
  const parameterMemoryBytes = (sharedParamsLocal / parameterShardFactor + expertParamsLocal / parameterShardFactor) * parameterBytes
  const optimizerMemoryBytes = (sharedParamsLocal / optimizerShardFactor + expertParamsLocal / optimizerShardFactor) * optimizerBytes
  const gradientMemoryBytes = (sharedParamsLocal / gradientShardFactor + expertParamsLocal / gradientShardFactor) * gradientBytes
  const denseLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: false,
  })
  const moeLayerActivationBytes = getActivationMemoryBytesPerLayer({
    model,
    training,
    parallelism,
    isMoeLayer: true,
  })
  const concurrentMicroBatches = getConcurrentMicroBatches(training, parallelism)
  let activationMemoryBytes = (denseLayerActivationBytes * stageParams.denseLayers + moeLayerActivationBytes * stageParams.moeLayers) * concurrentMicroBatches
  if (training.activationCheckpointing && stageParams.stageParams > 0) {
    // Recompute scratch for the largest single layer while checkpointing.
    activationMemoryBytes += Math.max(denseLayerActivationBytes, moeLayerActivationBytes) * 1.5
  }
  const totalBytes = parameterMemoryBytes + optimizerMemoryBytes + gradientMemoryBytes + activationMemoryBytes
  return {
    parametersGB: bytesToGB(parameterMemoryBytes),
    optimizerStatesGB: bytesToGB(optimizerMemoryBytes),
    gradientsGB: bytesToGB(gradientMemoryBytes),
    activationsGB: bytesToGB(activationMemoryBytes),
    totalGB: bytesToGB(totalBytes),
  }
}

/** Builds the layer distribution plus per-stage parameter and memory maps. */
const getStageMemoryMap = (
  model: ModelConfig,
  training: TrainingConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
) => {
  const modelBreakdown = getModelBreakdown(model)
  const layerDistribution =
    distributeLayers(model.numLayers, parallelism.pp)
  // Keyed by stage index. NOTE(review): bare `new Map()` infers Map<any, any>;
  // explicit Map<number, StageMemory> type args would tighten this.
  const stageMemory = new Map()
  const stageParameters = new Map()
  for (const stage of layerDistribution) {
    const stageParameterCount = getStageParameterCount(stage, modelBreakdown, parallelism, model)
    stageParameters.set(stage.stageIndex, stageParameterCount)
    stageMemory.set(
      stage.stageIndex,
      getStageMemory(stageParameterCount, model, training, parallelism, derivedParallelism),
    )
  }
  return {
    modelBreakdown,
    layerDistribution,
    stageMemory,
    stageParameters,
  }
}

/**
 * Lays every physical GPU out against the parallelism axes.
 * Nesting order: replicaGroup → fsdpRank → ppStage → cpShard → epLane → tpLane,
 * and a TP×EP group is never split across a node boundary.
 */
const buildPlacement = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
  derivedParallelism: DerivedParallelism,
  requiredGPUs: number,
) => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const placement: PlacementEntry[] = []
  let nodeIndex = 0
  let localGPUIndex = 0
  let globalGPUIndex = 0
  for (let replicaGroup = 0; replicaGroup < derivedParallelism.replicaGroups; replicaGroup += 1) {
    for (let fsdpRank = 0; fsdpRank < derivedParallelism.fsdpDataParallelDegree; fsdpRank += 1) {
      const dpReplica = replicaGroup * derivedParallelism.fsdpDataParallelDegree + fsdpRank
      for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) {
        for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
          // Advance to the next node if the whole TP×EP group no longer fits.
          if (localGPUIndex + parallelism.ep * parallelism.tp > cluster.gpusPerNode) {
            nodeIndex += 1
            localGPUIndex = 0
          }
          for (let epLane = 0; epLane < parallelism.ep; epLane += 1) {
            for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) {
              placement.push({
                globalGPUIndex,
                nodeIndex,
                localGPUIndex,
                // Unique TP-group id across all (dp, pp, cp, ep) coordinates.
                tpGroup: (((dpReplica * parallelism.pp + ppStage) * parallelism.cp + cpShard) * parallelism.ep) + epLane,
                tpLane,
                ppStage,
                cpShard,
                epLane,
                dpReplica,
                replicaGroup,
                fsdpRank,
                isActive: globalGPUIndex < requiredGPUs,
              })
              globalGPUIndex += 1
              localGPUIndex += 1
            }
          }
        }
      }
    }
  }
  // Pad the remaining physical GPUs as unassigned (-1) idle entries.
  while (placement.length < totalGPUs) {
    if (localGPUIndex >= cluster.gpusPerNode) {
      nodeIndex += 1
      localGPUIndex = 0
    }
    placement.push({
      globalGPUIndex,
      nodeIndex,
      localGPUIndex,
      tpGroup: -1,
      tpLane: -1,
      ppStage: -1,
      cpShard: -1,
      epLane: -1,
      dpReplica: -1,
      replicaGroup: -1,
      fsdpRank: -1,
      isActive: false,
    })
    globalGPUIndex += 1
    localGPUIndex += 1
  }
  return placement
}

/** First placement entry matching every provided coordinate filter. */
const getPlacementEntry = (
  placement: PlacementEntry[],
  filters: Partial<
    Pick<
      PlacementEntry,
      'dpReplica' | 'replicaGroup' | 'fsdpRank' | 'ppStage' | 'cpShard' | 'epLane' | 'tpLane'
    >
  >,
) =>
  placement.find(
    (entry) =>
      (filters.dpReplica == null || entry.dpReplica === filters.dpReplica) &&
      (filters.replicaGroup == null || entry.replicaGroup === filters.replicaGroup) &&
      (filters.fsdpRank == null || entry.fsdpRank === filters.fsdpRank) &&
      (filters.ppStage == null || entry.ppStage === filters.ppStage) &&
      (filters.cpShard == null || entry.cpShard === filters.cpShard) &&
      (filters.epLane == null || entry.epLane === filters.epLane) &&
      (filters.tpLane == null || entry.tpLane === filters.tpLane),
  )

/**
 * Derives DP and FSDP group sizes from the cluster and parallelism settings.
 * Returns null when the world size does not factor cleanly.
 */
const getDerivedParallelism = (
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
): DerivedParallelism | null => {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const modelParallelSize = parallelism.tp * parallelism.pp * parallelism.cp * parallelism.ep
  if (modelParallelSize <= 0 || totalGPUs % modelParallelSize !== 0) {
    return null
  }
  const dp = totalGPUs / modelParallelSize
  // Default FSDP group = one model replica (i.e. no cross-replica sharding).
  const fsdpGroupSize = parallelism.fsdpShardGroupSize > 1 ? parallelism.fsdpShardGroupSize : modelParallelSize
  if (fsdpGroupSize % modelParallelSize !== 0 || totalGPUs % fsdpGroupSize !== 0) {
    return null
  }
  return {
    modelParallelSize,
    dp,
    replicaGroups: totalGPUs / fsdpGroupSize,
    fsdpGroupSize,
    fsdpDataParallelDegree: fsdpGroupSize / modelParallelSize,
  }
}

/** Bottleneck link for a collective: inter-node wins if members span nodes. */
const getMaxBandwidthForCollective = (
  members: PlacementEntry[],
  cluster: ClusterConfig,
) => {
  if (members.length <= 1) {
    return {
      bandwidthGBs: cluster.intraNodeBandwidthGBs,
      usesInterNode: false,
    }
  }
  const nodeSet = new Set(members.map((member) => member.nodeIndex))
  const usesInterNode = nodeSet.size > 1
  return {
    bandwidthGBs: usesInterNode ?
      cluster.interNodeBandwidthGBs : cluster.intraNodeBandwidthGBs,
    usesInterNode,
  }
}

/**
 * Ring-collective cost model: each GPU moves 2(w-1)/w of the message per
 * collective, timed against the group's bottleneck link bandwidth.
 */
const getRingCommStats = ({
  groupCount,
  groupWidth,
  messageBytes,
  collectiveCount,
  membersForBandwidth,
  cluster,
  totalStepTimeMs,
}: {
  groupCount: number
  groupWidth: number
  messageBytes: number
  collectiveCount: number
  membersForBandwidth: PlacementEntry[]
  cluster: ClusterConfig
  totalStepTimeMs: number
}): RingCommStats => {
  // Degenerate groups or empty traffic cost nothing.
  if (groupWidth <= 1 || collectiveCount <= 0 || messageBytes <= 0) {
    return {
      volumeBytesPerGpu: 0,
      totalVolumeBytes: 0,
      timePerStepMs: 0,
      linkUtilizationPercent: 0,
      usesInterNode: false,
    }
  }
  const ringVolumeBytes = (2 * (groupWidth - 1) * messageBytes) / groupWidth
  const volumeBytesPerGpu = ringVolumeBytes * collectiveCount
  const totalVolumeBytes = volumeBytesPerGpu * groupWidth * groupCount
  const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(
    membersForBandwidth,
    cluster,
  )
  const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
  const linkUtilizationPercent = totalStepTimeMs > 0 ? clamp(
    (bytesToGB(volumeBytesPerGpu) / (bandwidthGBs * (totalStepTimeMs / 1000))) * 100,
    0,
    100,
  ) : 0
  return {
    volumeBytesPerGpu,
    totalVolumeBytes,
    timePerStepMs,
    linkUtilizationPercent,
    usesInterNode,
  }
}

/**
 * Top-level entry point: validates the configuration, sizes memory per stage,
 * prices every communication family, and estimates step time / MFU.
 */
export function analyzeCluster(
  model: ModelConfig,
  training: TrainingConfig,
  cluster: ClusterConfig,
  parallelism: ParallelismConfig,
): ClusterAnalysis {
  const totalGPUs = cluster.gpusPerNode * cluster.numNodes
  const derivedParallelism = getDerivedParallelism(cluster, parallelism)
  const globalBatchSizeTokens = training.microBatchSize * training.seqLength * training.gradAccumSteps * (derivedParallelism?.dp ??
    0)
  // Placeholder map of idle GPUs, used by every infeasible early return.
  const emptyGpuMap = Array.from({ length: totalGPUs }, (_, globalGPUIndex) => ({
    globalGPUIndex,
    nodeIndex: Math.floor(globalGPUIndex / cluster.gpusPerNode),
    localGPUIndex: globalGPUIndex % cluster.gpusPerNode,
    tpGroup: -1,
    tpLane: -1,
    ppStage: -1,
    cpShard: -1,
    epLane: -1,
    dpReplica: -1,
    replicaGroup: -1,
    fsdpRank: -1,
    memoryUsedGB: 0,
    memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
    isActive: false,
  }))
  // Zeroed-out analysis skeleton; callers overwrite infeasibilityReason.
  const emptyAnalysis = (): ClusterAnalysis => ({
    feasible: false,
    infeasibilityReason: 'Invalid configuration',
    totalParams: 0,
    activeParamsPerToken: 0,
    globalBatchSizeTokens,
    totalGPUs,
    derivedParallelism: {
      dp: derivedParallelism?.dp ?? 0,
      replicaGroups: derivedParallelism?.replicaGroups ?? 0,
      fsdpShardGroupSize: parallelism.fsdpShardGroupSize,
      fsdpGroupSize: derivedParallelism?.fsdpGroupSize ?? 0,
      ep: parallelism.ep,
    },
    memoryBreakdown: {
      parametersGB: 0,
      optimizerStatesGB: 0,
      gradientsGB: 0,
      activationsGB: 0,
      totalGB: 0,
      hbmCapacityGB: cluster.gpuType.hbmCapacityGB,
      utilizationPercent: 0,
    },
    pipelineStages: [],
    communication: {
      tp: { allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER, messageSizeBytes: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, linkUtilizationPercent: 0, },
      pp: { activationMessageSizeBytes: 0, numP2PTransfersPerStep: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, usesInterNode: false, },
      cp: { collectivesPerLayer: CP_COLLECTIVES_PER_LAYER, messageSizeBytes: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, linkUtilizationPercent: 0, usesInterNode: false, },
      fsdp: { collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER, messageSizeBytes: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, linkUtilizationPercent: 0, usesInterNode: false, },
      ep: { allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER, messageSizeBytes: 0, totalVolumePerStepGB: 0, timePerStepMs: 0, linkUtilizationPercent: 0, usesInterNode: false, },
      dp: { gradientVolumePerGPU_GB: 0, allReduceTimeMs: 0, canOverlapWithBackward: false, linkUtilizationPercent: 0, },
    },
    throughput: {
      computeTimePerStepMs: 0,
      communicationTimePerStepMs: 0,
      pipelineBubbleFraction: 0,
      pipelineBubbleTimeMs: 0,
      totalStepTimeMs: 0,
      tokensPerSecond: 0,
      mfu: 0,
    },
    gpuMap: emptyGpuMap,
    links: [],
  })
  // ---- Feasibility gates: each failure returns early with a specific reason.
  if (
    training.microBatchSize <= 0 ||
    training.seqLength <= 0 ||
    training.gradAccumSteps <= 0 ||
    parallelism.tp <= 0 ||
    parallelism.pp <= 0 ||
    parallelism.cp <= 0 ||
    parallelism.ep <= 0
  ) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = 'Batch sizes and parallelism degrees must all be positive.'
    return analysis
  }
  if (parallelism.tp * parallelism.ep > cluster.gpusPerNode) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `TP × EP requires ${parallelism.tp * parallelism.ep} GPUs per node, but nodes only have ${cluster.gpusPerNode}.`
    return analysis
  }
  if (!derivedParallelism) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `World size ${totalGPUs} must be divisible by TP × PP × CP × EP, and the FSDP shard group must divide the cluster cleanly.`
    return analysis
  }
  if (model.hiddenDim % model.numHeads !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `hiddenDim ${model.hiddenDim} must divide evenly across ${model.numHeads} attention heads.`
    return analysis
  }
  if (model.numHeads % parallelism.tp !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `TP ${parallelism.tp} must divide the ${model.numHeads} attention heads.`
    return analysis
  }
  if (model.numKVHeads % parallelism.tp !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `TP ${parallelism.tp} should divide the ${model.numKVHeads} KV heads for clean GQA sharding.`
    return analysis
  }
  if (training.seqLength % parallelism.cp !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `CP ${parallelism.cp} must divide the sequence length ${training.seqLength}.`
    return analysis
  }
  if (model.architecture === 'moe' && !model.moe) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = 'MoE models require expert metadata.'
    return analysis
  }
  if (model.architecture === 'moe' && model.moe && model.moe.numExperts % parallelism.ep !== 0) {
    const analysis = emptyAnalysis()
    analysis.infeasibilityReason = `EP ${parallelism.ep} must divide the ${model.moe.numExperts} experts.`
    return analysis
  }
  // ---- Memory sizing and placement.
  const { modelBreakdown, layerDistribution, stageMemory, stageParameters } = getStageMemoryMap(
    model,
    training,
    parallelism,
    derivedParallelism,
  )
  const placement = buildPlacement(cluster, parallelism, derivedParallelism, totalGPUs)
  const maxStageLayers = Math.max(...layerDistribution.map((stage) => stage.numLayers), 0)
  const pipelineStages = layerDistribution.map((stage) => {
    const stageMemoryBreakdown = stageMemory.get(stage.stageIndex)
    const stageParameterCount = stageParameters.get(stage.stageIndex)
    return {
      stageIndex: stage.stageIndex,
      layerRange: [stage.startLayer, stage.endLayer] as [number, number],
      numLayers: stage.numLayers,
      // Aggregate across every GPU that holds a shard of this stage.
      memoryGB: round2(
        (stageMemoryBreakdown?.totalGB ?? 0) * parallelism.tp * parallelism.cp * parallelism.ep * derivedParallelism.dp,
      ),
      hasEmbedding: stageParameterCount?.hasEmbedding ?? false,
      hasOutputHead: stageParameterCount?.hasOutputHead ?? false,
    }
  })
  // Worst stage = most per-GPU memory; used for the headline memory breakdown.
  const worstStageIndex = pipelineStages.reduce((worstIndex, stage) => {
    const worstStageMemory = stageMemory.get(worstIndex)?.totalGB ?? 0
    const candidateStageMemory = stageMemory.get(stage.stageIndex)?.totalGB ?? 0
    return candidateStageMemory > worstStageMemory ? stage.stageIndex : worstIndex
  }, 0)
  const worstStageMemory = stageMemory.get(worstStageIndex) ?? {
    parametersGB: 0,
    optimizerStatesGB: 0,
    gradientsGB: 0,
    activationsGB: 0,
    totalGB: 0,
  }
  // 1F1B-style bubble: (pp-1)/(m + pp - 1) with m = grad-accum micro-batches.
  const pipelineBubbleFraction = parallelism.pp <= 1 ? 0 : (parallelism.pp - 1) / (training.gradAccumSteps + parallelism.pp - 1)
  const boundaryStageCount = Math.min(
    parallelism.pp,
    Math.max(0, Math.round(pipelineBubbleFraction * parallelism.pp)),
  )
  const gpuMap = placement.map((entry) => {
    const stageMemoryBreakdown = entry.ppStage >= 0 ? stageMemory.get(entry.ppStage) ??
      { parametersGB: 0, optimizerStatesGB: 0, gradientsGB: 0, activationsGB: 0, totalGB: 0, } : { parametersGB: 0, optimizerStatesGB: 0, gradientsGB: 0, activationsGB: 0, totalGB: 0, }
    // Mark trailing stages as bubble-idle for visualization purposes.
    const bubbleIdle = entry.ppStage >= parallelism.pp - boundaryStageCount && entry.ppStage >= 0
    return {
      globalGPUIndex: entry.globalGPUIndex,
      nodeIndex: entry.nodeIndex,
      localGPUIndex: entry.localGPUIndex,
      tpGroup: entry.tpGroup,
      tpLane: entry.tpLane,
      ppStage: entry.ppStage,
      cpShard: entry.cpShard,
      epLane: entry.epLane,
      dpReplica: entry.dpReplica,
      replicaGroup: entry.replicaGroup,
      fsdpRank: entry.fsdpRank,
      memoryUsedGB: round2(entry.isActive ? stageMemoryBreakdown.totalGB : 0),
      memoryCapacityGB: cluster.gpuType.hbmCapacityGB,
      isActive: entry.isActive && !bubbleIdle,
    }
  })
  // ---- Compute-time model (6ND FLOPs heuristic with attention/ckpt factors).
  const activationBytes = getActivationBytes(training.precision)
  const shardedSequenceLength = training.seqLength / parallelism.cp
  const tokensPerMicroBatchShard = training.microBatchSize * shardedSequenceLength
  const collectiveMessageBytes = tokensPerMicroBatchShard * model.hiddenDim * activationBytes
  const attentionComputeMultiplier = 0.65 + 0.35 * getAttentionMultiplier(model, training.seqLength)
  // Activation checkpointing re-runs part of the forward pass (~20% extra).
  const activationCheckpointComputeMultiplier = training.activationCheckpointing ? 1.2 : 1
  const totalFlopsPerStep = 6 * modelBreakdown.activeParamsPerToken * training.microBatchSize * training.seqLength * training.gradAccumSteps * derivedParallelism.dp * attentionComputeMultiplier * activationCheckpointComputeMultiplier
  const launchedGPUs = Math.max(totalGPUs, 1)
  const flopsPerGpuPerStep = totalFlopsPerStep / launchedGPUs
  const peakTFLOPs = getPeakTFLOPsForPrecision(cluster.gpuType, training.precision)
  const sustainedTFLOPs = peakTFLOPs * getSustainedComputeEfficiency(training)
  const computeTimePerStepMs = (flopsPerGpuPerStep / (sustainedTFLOPs * 1e12)) * 1000
  const pipelineBubbleTimeMs = pipelineBubbleFraction >= 1 ?
    0 : (computeTimePerStepMs * pipelineBubbleFraction) / (1 - pipelineBubbleFraction)
  // Provisional step time used only to express comm volume as utilization.
  const tentativeTotalStepTimeMs = computeTimePerStepMs + pipelineBubbleTimeMs
  // ---- TP all-reduce traffic (one representative group sets the bandwidth).
  const tpMembers = placement.filter(
    (entry) =>
      entry.dpReplica === 0 && entry.ppStage === 0 && entry.cpShard === 0 && entry.epLane === 0 && entry.tpLane >= 0,
  )
  const tpStats = getRingCommStats({
    groupCount: parallelism.pp * parallelism.cp * parallelism.ep * derivedParallelism.dp,
    groupWidth: parallelism.tp,
    messageBytes: collectiveMessageBytes,
    collectiveCount: TP_ALL_REDUCES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
    membersForBandwidth: tpMembers,
    cluster,
    totalStepTimeMs: tentativeTotalStepTimeMs,
  })
  // ---- CP (context-parallel) collective traffic.
  const cpMembers = placement.filter(
    (entry) =>
      entry.dpReplica === 0 && entry.ppStage === 0 && entry.epLane === 0 && entry.tpLane === 0 && entry.cpShard >= 0,
  )
  const cpStats = getRingCommStats({
    groupCount: parallelism.pp * derivedParallelism.dp * parallelism.tp * parallelism.ep,
    groupWidth: parallelism.cp,
    messageBytes: collectiveMessageBytes,
    collectiveCount: CP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
    membersForBandwidth: cpMembers,
    cluster,
    totalStepTimeMs: tentativeTotalStepTimeMs,
  })
  // ---- FSDP all-gather/reduce-scatter traffic (ZeRO-3 only).
  const averageSharedLayerParams = model.numLayers > 0 ? (modelBreakdown.denseLayerCount * modelBreakdown.sharedDenseLayerParams + modelBreakdown.moeLayerCount * modelBreakdown.sharedMoeLayerParams) / model.numLayers : 0
  const fsdpMessageBytes = parallelism.zeroStage >= 3 && derivedParallelism.fsdpDataParallelDegree > 1 ?
    (averageSharedLayerParams / parallelism.tp / derivedParallelism.fsdpDataParallelDegree) * getParameterBytes(training.precision) : 0
  const fsdpMembers = placement.filter(
    (entry) =>
      entry.replicaGroup === 0 && entry.ppStage === 0 && entry.cpShard === 0 && entry.epLane === 0 && entry.tpLane === 0,
  )
  const fsdpStats = getRingCommStats({
    groupCount: derivedParallelism.replicaGroups * parallelism.pp * parallelism.cp * parallelism.ep * parallelism.tp,
    groupWidth: derivedParallelism.fsdpDataParallelDegree,
    messageBytes: fsdpMessageBytes,
    collectiveCount: FSDP_COLLECTIVES_PER_LAYER * maxStageLayers * training.gradAccumSteps,
    membersForBandwidth: fsdpMembers,
    cluster,
    totalStepTimeMs: tentativeTotalStepTimeMs,
  })
  // ---- EP all-to-all traffic (MoE token routing; dispatch + combine).
  const epMembers = placement.filter(
    (entry) =>
      entry.dpReplica === 0 && entry.ppStage === 0 && entry.cpShard === 0 && entry.tpLane === 0 && entry.epLane >= 0,
  )
  const moeLayerCount = modelBreakdown.moeLayerCount
  const epMessageBytes = model.architecture === 'moe' && model.moe ? tokensPerMicroBatchShard * model.hiddenDim * activationBytes * model.moe.expertsPerToken : 0
  const epTransferCount = EP_ALL_TO_ALLS_PER_LAYER * moeLayerCount * training.gradAccumSteps
  const epStats = (() => {
    if (parallelism.ep <= 1 || epTransferCount <= 0 || epMessageBytes <= 0) {
      return {
        totalVolumeBytes: 0,
        timePerStepMs: 0,
        linkUtilizationPercent: 0,
        usesInterNode: false,
      }
    }
    const { bandwidthGBs, usesInterNode } = getMaxBandwidthForCollective(epMembers, cluster)
    // ×2: every routed token is sent out (dispatch) and returned (combine).
    const volumeBytesPerGpu = epMessageBytes * epTransferCount * 2
    const totalVolumeBytes = volumeBytesPerGpu * parallelism.ep * parallelism.pp * parallelism.cp * parallelism.tp * derivedParallelism.dp
    const timePerStepMs = (bytesToGB(volumeBytesPerGpu) / bandwidthGBs) * 1000
    const linkUtilizationPercent = tentativeTotalStepTimeMs > 0 ?
      clamp(
        (bytesToGB(volumeBytesPerGpu) / (bandwidthGBs * (tentativeTotalStepTimeMs / 1000))) * 100,
        0,
        100,
      ) : 0
    return {
      totalVolumeBytes,
      timePerStepMs,
      linkUtilizationPercent,
      usesInterNode,
    }
  })()
  // ---- PP point-to-point traffic across adjacent stage boundaries.
  let ppTotalVolumeBytes = 0
  let ppTimePerStepMs = 0
  let ppUsesInterNode = false
  for (let dpReplica = 0; dpReplica < derivedParallelism.dp; dpReplica += 1) {
    for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) {
      for (let stageIndex = 0; stageIndex < parallelism.pp - 1; stageIndex += 1) {
        const source = getPlacementEntry(placement, {
          dpReplica,
          ppStage: stageIndex,
          cpShard,
          epLane: 0,
          tpLane: 0,
        })
        const target = getPlacementEntry(placement, {
          dpReplica,
          ppStage: stageIndex + 1,
          cpShard,
          epLane: 0,
          tpLane: 0,
        })
        if (!source || !target) {
          continue
        }
        const usesInterNode = source.nodeIndex !== target.nodeIndex
        const bandwidthGBs = usesInterNode ? cluster.interNodeBandwidthGBs : cluster.intraNodeBandwidthGBs
        const perLaneBytes = collectiveMessageBytes / parallelism.tp
        ppUsesInterNode ||= usesInterNode
        // ×2 per boundary: forward activations plus backward gradients.
        ppTotalVolumeBytes += collectiveMessageBytes * 2 * training.gradAccumSteps
        ppTimePerStepMs += (bytesToGB(perLaneBytes) / bandwidthGBs) * 1000 * 2 * training.gradAccumSteps
      }
    }
  }
  // ---- DP gradient all-reduce (or cross-replica-group reduce under FSDP).
  const maxStageGradientBytes = Math.max(
    ...Array.from(stageMemory.values()).map((stage) => stage.gradientsGB * BYTES_PER_GB),
    0,
  )
  const dpGroupWidth = parallelism.fsdpShardGroupSize > 1 ? derivedParallelism.replicaGroups : derivedParallelism.dp
  const dpMembers = parallelism.fsdpShardGroupSize > 1 ? placement.filter(
    (entry) =>
      entry.fsdpRank === 0 && entry.ppStage === 0 && entry.cpShard === 0 && entry.epLane === 0 && entry.tpLane === 0,
  ) : placement.filter(
    (entry) =>
      entry.ppStage === 0 && entry.cpShard === 0 && entry.epLane === 0 && entry.tpLane === 0,
  )
  const gradientCommBytesPerGpu = dpGroupWidth > 1 ? (2 * (dpGroupWidth - 1) * maxStageGradientBytes) / dpGroupWidth : 0
  const dpBandwidth = getMaxBandwidthForCollective(dpMembers, cluster)
  const dpTimeMs = dpGroupWidth > 1 ?
    (bytesToGB(gradientCommBytesPerGpu) / dpBandwidth.bandwidthGBs) * 1000 : 0
  // With PP or grad accumulation, most of the all-reduce hides under backward.
  const canOverlapDp = dpGroupWidth > 1 && (parallelism.pp > 1 || training.gradAccumSteps > 1)
  const dpNonOverlappedTimeMs = dpTimeMs * (canOverlapDp ? 0.35 : 1)
  // ---- Final step time, throughput, and utilization figures.
  const communicationTimePerStepMs = tpStats.timePerStepMs + cpStats.timePerStepMs + fsdpStats.timePerStepMs + epStats.timePerStepMs + ppTimePerStepMs + dpNonOverlappedTimeMs
  const totalStepTimeMs = computeTimePerStepMs + pipelineBubbleTimeMs + communicationTimePerStepMs
  const tokensPerSecond = totalStepTimeMs > 0 ? globalBatchSizeTokens / (totalStepTimeMs / 1000) : 0
  const mfu = tokensPerSecond > 0 ? clamp(
    (6 * modelBreakdown.activeParamsPerToken * attentionComputeMultiplier * tokensPerSecond) / (launchedGPUs * peakTFLOPs * 1e12),
    0,
    1,
  ) : 0
  const dpLinkUtilizationPercent = dpGroupWidth > 1 && totalStepTimeMs > 0 ? clamp(
    (bytesToGB(gradientCommBytesPerGpu) / (dpBandwidth.bandwidthGBs * (totalStepTimeMs / 1000))) * 100,
    0,
    100,
  ) : 0
  const ppPerLaneVolumeGB = parallelism.pp > 1 ? bytesToGB(collectiveMessageBytes / parallelism.tp) * 2 * training.gradAccumSteps : 0
  const ppLinkUtilizationPercent = parallelism.pp > 1 && totalStepTimeMs > 0 ? clamp(
    (ppPerLaneVolumeGB / ((ppUsesInterNode ?
cluster.interNodeBandwidthGBs : cluster.intraNodeBandwidthGBs) * (totalStepTimeMs / 1000))) * 100, 0, 100, ) : 0 const links: ClusterAnalysis['links'] = [] const visualReplicaSamples = Math.min(derivedParallelism.dp, 12) const sampledDpReplicas = Array.from({ length: visualReplicaSamples }, (_, sampleIndex) => Math.floor((sampleIndex * derivedParallelism.dp) / visualReplicaSamples), ) for (const dpReplica of sampledDpReplicas) { for (let ppStage = 0; ppStage < parallelism.pp; ppStage += 1) { for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { const tpEntries = placement .filter( (entry) => entry.dpReplica === dpReplica && entry.ppStage === ppStage && entry.cpShard === cpShard && entry.epLane === epLane, ) .sort((left, right) => left.tpLane - right.tpLane) if (parallelism.tp > 1) { for (let lane = 0; lane < tpEntries.length; lane += 1) { const from = tpEntries[lane] const to = tpEntries[(lane + 1) % tpEntries.length] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: 'nvlink', trafficType: 'tp', volumeGB: round2(bytesToGB(tpStats.volumeBytesPerGpu)), utilizationPercent: round2(tpStats.linkUtilizationPercent), }) } } if (ppStage < parallelism.pp - 1) { const nextTpEntries = placement .filter( (entry) => entry.dpReplica === dpReplica && entry.ppStage === ppStage + 1 && entry.cpShard === cpShard && entry.epLane === epLane, ) .sort((left, right) => left.tpLane - right.tpLane) for (let lane = 0; lane < Math.min(tpEntries.length, nextTpEntries.length); lane += 1) { const from = tpEntries[lane] const to = nextTpEntries[lane] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 
'nvlink' : 'infiniband', trafficType: 'pp', volumeGB: round2(ppPerLaneVolumeGB), utilizationPercent: round2(ppLinkUtilizationPercent), }) } } } } if (parallelism.cp > 1) { for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { const cpEntries = placement .filter( (entry) => entry.dpReplica === dpReplica && entry.ppStage === ppStage && entry.epLane === epLane && entry.tpLane === tpLane, ) .sort((left, right) => left.cpShard - right.cpShard) for (let shardIndex = 0; shardIndex < cpEntries.length; shardIndex += 1) { const from = cpEntries[shardIndex] const to = cpEntries[(shardIndex + 1) % cpEntries.length] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', trafficType: 'cp', volumeGB: round2(bytesToGB(cpStats.volumeBytesPerGpu)), utilizationPercent: round2(cpStats.linkUtilizationPercent), }) } } } } if (parallelism.ep > 1) { for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { const epEntries = placement .filter( (entry) => entry.dpReplica === dpReplica && entry.ppStage === ppStage && entry.cpShard === cpShard && entry.tpLane === tpLane, ) .sort((left, right) => left.epLane - right.epLane) for (let lane = 0; lane < epEntries.length; lane += 1) { const from = epEntries[lane] const to = epEntries[(lane + 1) % epEntries.length] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', trafficType: 'ep', volumeGB: round2( epStats.totalVolumeBytes > 0 ? 
bytesToGB(epStats.totalVolumeBytes) / (parallelism.ep * Math.max(parallelism.tp * parallelism.cp * parallelism.pp * derivedParallelism.dp, 1)) : 0, ), utilizationPercent: round2(epStats.linkUtilizationPercent), }) } } } } if (derivedParallelism.fsdpDataParallelDegree > 1) { for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { const fsdpEntries = placement .filter( (entry) => entry.replicaGroup === placement.find((item) => item.dpReplica === dpReplica)?.replicaGroup && entry.ppStage === ppStage && entry.cpShard === cpShard && entry.epLane === epLane && entry.tpLane === tpLane, ) .sort((left, right) => left.fsdpRank - right.fsdpRank) for (let rank = 0; rank < fsdpEntries.length; rank += 1) { const from = fsdpEntries[rank] const to = fsdpEntries[(rank + 1) % fsdpEntries.length] links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', trafficType: 'fsdp', volumeGB: round2(bytesToGB(fsdpStats.volumeBytesPerGpu)), utilizationPercent: round2(fsdpStats.linkUtilizationPercent), }) } } } } } if (dpGroupWidth > 1) { for (let cpShard = 0; cpShard < parallelism.cp; cpShard += 1) { for (let epLane = 0; epLane < parallelism.ep; epLane += 1) { for (let tpLane = 0; tpLane < parallelism.tp; tpLane += 1) { const current = placement.find((entry) => entry.dpReplica === dpReplica) if (!current) { continue } const from = getPlacementEntry(placement, { replicaGroup: parallelism.fsdpShardGroupSize > 1 ? current.replicaGroup : undefined, fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined, dpReplica: parallelism.fsdpShardGroupSize > 1 ? undefined : dpReplica, ppStage, cpShard, epLane, tpLane, }) const to = getPlacementEntry(placement, { replicaGroup: parallelism.fsdpShardGroupSize > 1 ? 
(current.replicaGroup + 1) % derivedParallelism.replicaGroups : undefined, fsdpRank: parallelism.fsdpShardGroupSize > 1 ? current.fsdpRank : undefined, dpReplica: parallelism.fsdpShardGroupSize > 1 ? undefined : (dpReplica + 1) % derivedParallelism.dp, ppStage, cpShard, epLane, tpLane, }) if (!from || !to) { continue } links.push({ fromGPU: from.globalGPUIndex, toGPU: to.globalGPUIndex, type: from.nodeIndex === to.nodeIndex ? 'nvlink' : 'infiniband', trafficType: 'dp', volumeGB: round2(bytesToGB(gradientCommBytesPerGpu)), utilizationPercent: round2(dpLinkUtilizationPercent), }) } } } } } } const feasible = worstStageMemory.totalGB <= cluster.gpuType.hbmCapacityGB const infeasibilityReason = feasible ? undefined : `Stage ${worstStageIndex} uses ${round2(worstStageMemory.totalGB)} GB per GPU, exceeding ${cluster.gpuType.hbmCapacityGB} GB of HBM.` return { feasible, infeasibilityReason, totalParams: Math.round(modelBreakdown.totalParams), activeParamsPerToken: Math.round(modelBreakdown.activeParamsPerToken), globalBatchSizeTokens, totalGPUs, derivedParallelism: { dp: derivedParallelism.dp, replicaGroups: derivedParallelism.replicaGroups, fsdpShardGroupSize: parallelism.fsdpShardGroupSize, fsdpGroupSize: derivedParallelism.fsdpGroupSize, ep: parallelism.ep, }, memoryBreakdown: { parametersGB: round2(worstStageMemory.parametersGB), optimizerStatesGB: round2(worstStageMemory.optimizerStatesGB), gradientsGB: round2(worstStageMemory.gradientsGB), activationsGB: round2(worstStageMemory.activationsGB), totalGB: round2(worstStageMemory.totalGB), hbmCapacityGB: cluster.gpuType.hbmCapacityGB, utilizationPercent: round2( (worstStageMemory.totalGB / cluster.gpuType.hbmCapacityGB) * 100, ), }, pipelineStages, communication: { tp: { allReducesPerLayer: TP_ALL_REDUCES_PER_LAYER, messageSizeBytes: collectiveMessageBytes, totalVolumePerStepGB: round2(bytesToGB(tpStats.totalVolumeBytes)), timePerStepMs: round2(tpStats.timePerStepMs), linkUtilizationPercent: 
round2(tpStats.linkUtilizationPercent), }, pp: { activationMessageSizeBytes: collectiveMessageBytes, numP2PTransfersPerStep: parallelism.pp > 1 ? 2 * (parallelism.pp - 1) * training.gradAccumSteps * parallelism.cp * parallelism.tp * derivedParallelism.dp : 0, totalVolumePerStepGB: round2(bytesToGB(ppTotalVolumeBytes)), timePerStepMs: round2(ppTimePerStepMs), usesInterNode: ppUsesInterNode, }, cp: { collectivesPerLayer: CP_COLLECTIVES_PER_LAYER, messageSizeBytes: collectiveMessageBytes, totalVolumePerStepGB: round2(bytesToGB(cpStats.totalVolumeBytes)), timePerStepMs: round2(cpStats.timePerStepMs), linkUtilizationPercent: round2(cpStats.linkUtilizationPercent), usesInterNode: cpStats.usesInterNode, }, fsdp: { collectivesPerLayer: FSDP_COLLECTIVES_PER_LAYER, messageSizeBytes: round2(fsdpMessageBytes), totalVolumePerStepGB: round2(bytesToGB(fsdpStats.totalVolumeBytes)), timePerStepMs: round2(fsdpStats.timePerStepMs), linkUtilizationPercent: round2(fsdpStats.linkUtilizationPercent), usesInterNode: fsdpStats.usesInterNode, }, ep: { allToAllsPerLayer: EP_ALL_TO_ALLS_PER_LAYER, messageSizeBytes: round2(epMessageBytes), totalVolumePerStepGB: round2(bytesToGB(epStats.totalVolumeBytes)), timePerStepMs: round2(epStats.timePerStepMs), linkUtilizationPercent: round2(epStats.linkUtilizationPercent), usesInterNode: epStats.usesInterNode, }, dp: { gradientVolumePerGPU_GB: round2(bytesToGB(gradientCommBytesPerGpu)), allReduceTimeMs: round2(dpTimeMs), canOverlapWithBackward: canOverlapDp, linkUtilizationPercent: round2(dpLinkUtilizationPercent), }, }, throughput: { computeTimePerStepMs: round2(computeTimePerStepMs), communicationTimePerStepMs: round2(communicationTimePerStepMs), pipelineBubbleFraction: round2(pipelineBubbleFraction), pipelineBubbleTimeMs: round2(pipelineBubbleTimeMs), totalStepTimeMs: round2(totalStepTimeMs), tokensPerSecond: round2(tokensPerSecond), mfu: round2(mfu), }, gpuMap, links, } } export const llama7B = (): ModelConfig => ({ architecture: 'dense', hiddenDim: 
4096,
  numLayers: 32,
  numHeads: 32,
  numKVHeads: 32,
  vocabSize: 32000,
  intermediateSize: 11008,
  tiedEmbeddings: false,
  attentionProfile: { type: 'full' },
})

/** Llama 2 70B preset: dense, grouped-query attention (64 query heads, 8 KV heads). */
export const llama70B = (): ModelConfig => ({
  architecture: 'dense',
  hiddenDim: 8192,
  numLayers: 80,
  numHeads: 64,
  numKVHeads: 8,
  vocabSize: 32000,
  intermediateSize: 28672,
  tiedEmbeddings: false,
  attentionProfile: { type: 'full' },
})

/** Llama 405B-scale preset: dense, full attention, 128k-entry vocabulary. */
export const llama405B = (): ModelConfig => ({
  architecture: 'dense',
  hiddenDim: 16384,
  numLayers: 126,
  numHeads: 128,
  numKVHeads: 8,
  vocabSize: 128256,
  intermediateSize: 53248,
  tiedEmbeddings: false,
  attentionProfile: { type: 'full' },
})

/** OLMo 3 32B preset: hybrid attention — 25% of layers global, the rest a 4k sliding window. */
export const olmo3_32B = (): ModelConfig => ({
  architecture: 'dense',
  hiddenDim: 5120,
  numLayers: 64,
  numHeads: 40,
  numKVHeads: 8,
  vocabSize: 100278,
  intermediateSize: 27648,
  tiedEmbeddings: false,
  attentionProfile: {
    type: 'hybrid',
    slidingWindowSize: 4096,
    globalAttentionFraction: 0.25,
  },
})

/**
 * Llama 3.1 405B preset. Its configuration is field-for-field identical to
 * {@link llama405B}, so it delegates instead of duplicating the literal —
 * this keeps the two presets from silently drifting apart.
 */
export const llama31_405B = (): ModelConfig => llama405B()

/**
 * ~400B-total-parameter MoE preset: 256 experts with 4 routed per token,
 * the first 6 layers dense, and hybrid attention with a global layer every
 * 4th layer (4k sliding window elsewhere).
 */
export const trinityLarge400B = (): ModelConfig => ({
  architecture: 'moe',
  hiddenDim: 3072,
  numLayers: 60,
  numHeads: 48,
  numKVHeads: 8,
  vocabSize: 200192,
  intermediateSize: 12288,
  tiedEmbeddings: false,
  attentionProfile: {
    type: 'hybrid',
    slidingWindowSize: 4096,
    globalAttentionEveryN: 4,
  },
  moe: {
    numExperts: 256,
    expertsPerToken: 4,
    numDenseLayers: 6,
    expertIntermediateSize: 3072,
    activeParamsPerToken: 13_000_000_000,
  },
})

/** NVIDIA A100 80GB accelerator spec. */
export const a100_80gb = (): GPUSpec => ({
  name: 'A100 80GB',
  hbmCapacityGB: 80,
  peakTFLOPsBF16: 312,
  memBandwidthTBs: 2,
})

/** NVIDIA H100 SXM accelerator spec. */
export const h100_sxm = (): GPUSpec => ({
  name: 'H100 SXM',
  hbmCapacityGB: 80,
  peakTFLOPsBF16: 989,
  memBandwidthTBs: 3.35,
})

// NOTE(review): these figures match GB200-class silicon (192 GB / 2250 TFLOPs);
// B300 is usually quoted with 288 GB HBM3e — confirm 192 GB is intentional here.
export const b300 = (): GPUSpec => ({
  name: 'B300',
  hbmCapacityGB: 192,
  peakTFLOPsBF16: 2250,
  memBandwidthTBs: 8,
})

/** NVIDIA GB200 accelerator spec (per-GPU view of a Grace-Blackwell superchip). */
export const gb200 = ():
GPUSpec => ({ name: 'GB200', hbmCapacityGB: 192, peakTFLOPsBF16: 2250, memBandwidthTBs: 8, }) export const singleNode8GPU = (gpuType: GPUSpec = a100_80gb()): ClusterConfig => { const fabric = getDefaultFabric(gpuType) return { gpuType, gpusPerNode: 8, numNodes: 1, intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, nodesPerRack: 1, rackLabel: 'node', nodeLabel: 'GPU host', podLabel: 'node', } } export const cluster64GPU = (gpuType: GPUSpec = h100_sxm()): ClusterConfig => { const fabric = getDefaultFabric(gpuType) return { gpuType, gpusPerNode: 8, numNodes: 8, intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, nodesPerRack: 4, rackLabel: 'rack', nodeLabel: 'GPU host', podLabel: 'rack', } } export const frontier576GPU = (): ClusterConfig => { const gpuType = gb200() const fabric = getDefaultFabric(gpuType) return { gpuType, gpusPerNode: 8, numNodes: 72, intraNodeBandwidthGBs: fabric.intraNodeBandwidthGBs, interNodeBandwidthGBs: fabric.interNodeBandwidthGBs, nodesPerRack: 9, rackLabel: 'NVL72 rack', nodeLabel: 'compute tray', podLabel: 'rack', } }