diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..50bb14c4db1c10f694e812d22ef59929e302dece
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,92 @@
+# Git
+.git
+.gitignore
+.github
+
+# Docker
+Dockerfile
+.dockerignore
+
+# Python
+__pycache__
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+venv/
+env/
+ENV/
+.venv/
+.env
+
+# Testing
+.pytest_cache/
+.coverage
+coverage.xml
+htmlcov/
+.tox/
+.mypy_cache/
+.ruff_cache/
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Claude
+.claude/
+.mcp.json
+
+# Documentation (source files included, but skip extras)
+docs/
+*.md
+!README.md
+
+# Project specific
+*.log
+.env
+.venv/
+
+# CI/CD
+CODE_OF_CONDUCT.md
+CONTRIBUTING.md
+MARKETING.md
+SECURITY.md
+CHANGELOG.md
+
+# Screenshots and images
+*.png
+*.jpg
+*.jpeg
+*.gif
+!screenshot.png
+
+# Test files
+tests/
+examples/
+configs/
+
+# MCP server config
+.mcp.json
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..9b839e9bebc66a3498e285538b5bd85051a4e045
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,40 @@
+# Dockerfile for Hugging Face Spaces
+# GPU Memory Calculator - FastAPI Web Application
+
+FROM python:3.12-slim
+
+# Set working directory
+WORKDIR /app
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+ PYTHONDONTWRITEBYTECODE=1 \
+ PORT=7860
+
+# Install system dependencies
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ gcc \
+ && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first for better Docker layer caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy project files
+COPY . .
+
+# Install the package in editable mode
+RUN pip install --no-cache-dir -e .
+
+# Expose Hugging Face Spaces default port
+EXPOSE 7860
+
+# Health check endpoint
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/').read()"
+
+# Run the FastAPI application with uvicorn
+CMD ["uvicorn", "web.app:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/README.md b/README.md
index 902ce660e11c35c768d6103bffd24cf545f2c7a8..82a7ba4f4fb18a6d1663fe79c9a425d64051c99a 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,62 @@
---
-title: Gpu Memory Calculator
-emoji: ๐ป
-colorFrom: gray
-colorTo: yellow
+title: GPU Memory Calculator
+emoji: 🔮
+colorFrom: blue
+colorTo: purple
sdk: docker
pinned: false
-license: apache-2.0
-short_description: Calculates GPU memory for training, inference, and more
+license: mit
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# GPU Memory Calculator
+
+Calculate GPU memory requirements for training and running Large Language Models (LLMs). Supports multiple training engines (PyTorch DDP, DeepSpeed ZeRO, Megatron-LM, FSDP), inference engines (HuggingFace, vLLM, TGI, TensorRT-LLM, SGLang), and multi-node training configurations.
+
+## Features
+
+- **Training Memory Calculation**: Calculate memory for PyTorch DDP, DeepSpeed ZeRO (0-3), Megatron-LM, FSDP, and hybrid approaches
+- **Inference Memory Calculation**: Estimate memory requirements for HuggingFace Transformers, vLLM, TGI, TensorRT-LLM, and SGLang
+- **Multi-Node Support**: Calculate network overhead for distributed training across multiple nodes
+- **Model Presets**: Pre-configured settings for popular models (LLaMA 2, GPT-3, Mixtral, GLM, Qwen, DeepSeek-MoE)
+- **Configuration Export**: Generate configs for Accelerate, Lightning, Axolotl, DeepSpeed, YAML, and JSON
+- **Batch Size Optimization**: Automatically find the maximum batch size that fits in GPU memory
+
+## Supported Training Engines
+
+- PyTorch DDP (Distributed Data Parallel)
+- DeepSpeed ZeRO (Stages 0-3) with CPU/NVMe offloading
+- Megatron-LM (Tensor + Pipeline Parallelism)
+- PyTorch FSDP (Fully Sharded Data Parallel)
+- Megatron-LM + DeepSpeed (Hybrid)
+
+## Supported Inference Engines
+
+- HuggingFace Transformers
+- vLLM (PagedAttention)
+- Text Generation Inference (TGI)
+- TensorRT-LLM
+- SGLang (RadixAttention)
+
+## How to Use
+
+1. **Select a preset model** or configure your own
+2. **Choose training/inference engine** and adjust parameters
+3. **Calculate** memory requirements instantly
+4. **Export** configurations to your preferred framework
+
+## Example Use Cases
+
+- Planning GPU requirements for LLM training
+- Optimizing batch sizes for your hardware
+- Comparing memory efficiency across engines
+- Estimating KV cache memory for inference
+- Calculating multi-node network overhead
+
+## Links
+
+- [GitHub Repository](https://github.com/George614/gpu-mem-calculator)
+- [Documentation](https://github.com/George614/gpu-mem-calculator/blob/main/README.md)
+
+## License
+
+MIT License - see [LICENSE](https://github.com/George614/gpu-mem-calculator/blob/main/LICENSE) for details.
diff --git a/cli/main.py b/cli/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd35d265a2e992d473bdafdff54fa5c1fba7b5c9
--- /dev/null
+++ b/cli/main.py
@@ -0,0 +1,399 @@
+"""CLI interface for GPU Memory Calculator."""
+
+import json
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+import click
+
+if TYPE_CHECKING:
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+ from gpu_mem_calculator.core.models import MemoryResult
+
+
+@click.group()
+@click.version_option(version="0.1.0")
+def main() -> None:
+ """GPU Memory Calculator for LLM Training.
+
+ Calculate GPU memory requirements for training Large Language Models
+ with various training engines (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP).
+ """
+ pass
+
+
+@main.command()
+@click.option(
+ "--config",
+ "-c",
+ type=click.Path(exists=True),
+ help="Path to JSON configuration file",
+)
+@click.option(
+ "--preset",
+ "-p",
+ type=str,
+ help="Name of a preset model configuration",
+)
+@click.option(
+ "--output",
+ "-o",
+ type=click.Path(),
+ help="Output file path (default: stdout)",
+)
+@click.option(
+ "--format",
+ "-f",
+ type=click.Choice(["json", "yaml", "table"]),
+ default="table",
+ help="Output format (default: table)",
+)
+def calculate(
+ config: str | None,
+ preset: str | None,
+ output: str | None,
+ format: Literal["json", "yaml", "table"],
+) -> None:
+ """Calculate GPU memory requirements from config file or preset.
+
+ Examples:
+ gpu-mem-calc calculate --config configs/llama2_7b.json
+ gpu-mem-calc calculate --preset llama2-7b
+ gpu-mem-calc calculate -p mixtral-8x7b --format json
+ """
+ if not config and not preset:
+ click.echo("Error: Either --config or --preset is required", err=True)
+ sys.exit(1)
+
+ if config and preset:
+ click.echo("Error: Cannot use both --config and --preset", err=True)
+ sys.exit(1)
+
+ try:
+ import tempfile
+
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+
+ if preset:
+ # Load preset configuration
+ from gpu_mem_calculator.config.presets import get_preset_config
+
+ preset_config = get_preset_config(preset)
+ if preset_config is None:
+ click.echo(
+ f"Error: Preset '{preset}' not found. "
+ "Use 'gpu-mem-calc presets' to list available presets.",
+ err=True,
+ )
+ sys.exit(1)
+
+ # Write preset to temp file for from_config_file
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+ json.dump(preset_config, f, indent=2)
+ temp_path = f.name
+
+ calculator = GPUMemoryCalculator.from_config_file(temp_path)
+ Path(temp_path).unlink() # Clean up temp file
+ elif config:
+ calculator = GPUMemoryCalculator.from_config_file(config)
+ else:
+ # This should never happen due to the checks above
+ click.echo("Error: Either --config or --preset is required", err=True)
+ sys.exit(1)
+
+ result = calculator.calculate()
+
+ # Format output
+ if format == "json":
+ output_text = json.dumps(result.model_dump(mode="json"), indent=2)
+ elif format == "yaml":
+ try:
+ import yaml # type: ignore[import-untyped]
+
+ output_text = yaml.dump(result.model_dump(mode="json"), default_flow_style=False)
+ except ImportError:
+ click.echo(
+ "Error: YAML format requires PyYAML. Install with: pip install pyyaml",
+ err=True,
+ )
+ sys.exit(1)
+ else: # table
+ output_text = _format_result_as_table(result, calculator)
+
+ # Write output
+ if output:
+ Path(output).write_text(output_text)
+ click.echo(f"Results written to {output}")
+ else:
+ click.echo(output_text)
+
+ except Exception as e:
+ click.echo(f"Error: {e}", err=True)
+ sys.exit(1)
+
+
+@main.command()
+@click.argument(
+ "params",
+ type=float,
+ required=True,
+)
+@click.option(
+ "--gpus",
+ "-g",
+ type=int,
+ default=1,
+ help="Number of GPUs (default: 1)",
+)
+@click.option(
+ "--gpu-mem",
+ "-m",
+ type=float,
+ default=80.0,
+ help="GPU memory in GB (default: 80.0)",
+)
+@click.option(
+ "--engine",
+ "-e",
+ type=click.Choice(["pytorch", "deepspeed", "megatron", "fsdp"]),
+ default="pytorch",
+ help="Training engine (default: pytorch)",
+)
+@click.option(
+ "--dtype",
+ "-d",
+ type=click.Choice(["fp32", "fp16", "bf16"]),
+ default="bf16",
+ help="Data type (default: bf16)",
+)
+def quick(
+ params: float,
+ gpus: int,
+ gpu_mem: float,
+ engine: str,
+ dtype: str,
+) -> None:
+ """Quick calculation from model size (in billions of parameters).
+
+ Example:
+ gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
+ """
+ try:
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+ from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ GPUConfig,
+ ModelConfig,
+ ParallelismConfig,
+ TrainingConfig,
+ )
+
+ # Map engine string to EngineType
+ engine_map = {
+ "pytorch": EngineType.PYTORCH_DDP,
+ "deepspeed": EngineType.DEEPSPEED,
+ "megatron": EngineType.MEGATRON_LM,
+ "fsdp": EngineType.FSDP,
+ }
+
+ # Map dtype string to DType
+ dtype_map = {
+ "fp32": DType.FP32,
+ "fp16": DType.FP16,
+ "bf16": DType.BF16,
+ }
+
+ # Create a minimal config for quick calculation
+ # Estimate model architecture from parameter count
+ # Rough approximation based on typical transformer models
+ num_params = int(params * 1e9)
+
+ # Estimate hidden size and layers from param count
+ # These are rough approximations
+ if params <= 1:
+ hidden_size, num_layers = 768, 12
+ elif params <= 7:
+ hidden_size, num_layers = 4096, 32
+ elif params <= 13:
+ hidden_size, num_layers = 5120, 40
+ elif params <= 30:
+ hidden_size, num_layers = 6656, 60
+ elif params <= 65:
+ hidden_size, num_layers = 8192, 80
+ else:
+ hidden_size, num_layers = 12288, 96
+
+ model_config = ModelConfig(
+ name="quick-estimate",
+ num_parameters=num_params,
+ num_layers=num_layers,
+ hidden_size=hidden_size,
+ num_attention_heads=hidden_size // 128,
+ vocab_size=32000,
+ max_seq_len=2048,
+ )
+
+ training_config = TrainingConfig(
+ batch_size=1,
+ gradient_accumulation_steps=1,
+ dtype=dtype_map[dtype],
+ )
+
+ parallelism_config = ParallelismConfig(data_parallel_size=gpus)
+
+ engine_config = EngineConfig(
+ type=engine_map[engine],
+ zero_stage=2 if engine == "deepspeed" else None,
+ )
+
+ gpu_config = GPUConfig(num_gpus=gpus, gpu_memory_gb=gpu_mem)
+
+ calculator = GPUMemoryCalculator(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ engine_config=engine_config,
+ gpu_config=gpu_config,
+ )
+
+ result = calculator.calculate()
+
+ # Display results
+ click.echo(_format_result_as_table(result, calculator))
+
+ except Exception as e:
+ click.echo(f"Error: {e}", err=True)
+ sys.exit(1)
+
+
+@main.command()
+@click.argument(
+ "config_path",
+ type=click.Path(exists=True),
+)
+def validate(config_path: str) -> None:
+ """Validate a configuration file.
+
+ Example:
+ gpu-mem-calc validate configs/my_config.json
+ """
+ try:
+ from gpu_mem_calculator.config import ConfigParser
+
+ ConfigParser.parse_full_config(config_path)
+ click.echo(f"โ Configuration file '{config_path}' is valid")
+
+ except Exception as e:
+ click.echo(f"โ Validation failed: {e}", err=True)
+ sys.exit(1)
+
+
+@main.command()
+@click.option(
+ "--format",
+ "-f",
+ type=click.Choice(["list", "json", "table"]),
+ default="list",
+ help="Output format (default: list)",
+)
+def presets(format: str) -> None:
+ """List available model preset configurations.
+
+ Examples:
+ gpu-mem-calc presets
+ gpu-mem-calc presets --format table
+ gpu-mem-calc presets -f json
+ """
+ try:
+ from gpu_mem_calculator.config.presets import list_presets
+
+ all_presets = list_presets()
+
+ if not all_presets:
+ click.echo("No presets found.")
+ return
+
+ if format == "json":
+ click.echo(json.dumps(all_presets, indent=2))
+ elif format == "table":
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+ table = Table(
+ title="Available Model Presets",
+ show_header=True,
+ header_style="bold magenta",
+ )
+ table.add_column("Preset Name", style="cyan", width=25)
+ table.add_column("Display Name", style="green", width=30)
+ table.add_column("Description", style="yellow")
+
+ for name, info in sorted(all_presets.items()):
+ table.add_row(name, info["display_name"], info["description"])
+
+ console.print(table)
+ else: # list format
+ click.echo("Available model presets:\n")
+ for name, info in sorted(all_presets.items()): # type: ignore[annotation-unchecked]
+ click.echo(f" {name:25} - {info['display_name']}")
+ if info.get("description"):
+ click.echo(f"{'':27}{info['description']}")
+ click.echo()
+
+ except Exception as e:
+ click.echo(f"Error: {e}", err=True)
+ sys.exit(1)
+
+
+def _format_result_as_table(result: MemoryResult, calculator: "GPUMemoryCalculator") -> str:
+ """Format result as ASCII table."""
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+
+ # Main results table
+ table = Table(
+ title="GPU Memory Calculation Results",
+ show_header=True,
+ header_style="bold magenta",
+ )
+ table.add_column("Metric", style="cyan", width=30)
+ table.add_column("Value", style="green")
+
+ # Memory results
+ table.add_row("Memory per GPU", f"{result.total_memory_per_gpu_gb:.2f} GB")
+ table.add_row("Total GPU Memory", f"{result.total_memory_all_gpus_gb:.2f} GB")
+ table.add_row("CPU Memory", f"{result.cpu_memory_gb:.2f} GB")
+ table.add_row("", "") # Spacer
+
+ # Breakdown
+ table.add_row("Model Parameters", f"{result.breakdown.model_params_gb:.2f} GB")
+ table.add_row("Gradients", f"{result.breakdown.gradients_gb:.2f} GB")
+ table.add_row("Optimizer States", f"{result.breakdown.optimizer_states_gb:.2f} GB")
+ table.add_row("Activations", f"{result.breakdown.activations_gb:.2f} GB")
+ table.add_row("Overhead", f"{result.breakdown.overhead_gb:.2f} GB")
+ table.add_row("", "") # Spacer
+
+ # Feasibility
+ status = "โ Fits" if result.fits_on_gpu else "โ OOM"
+ table.add_row("Status", status)
+ table.add_row("Memory Utilization", f"{result.memory_utilization_percent:.1f}%")
+ if result.recommended_batch_size:
+ table.add_row("Recommended Batch Size", str(result.recommended_batch_size))
+
+ # Capture table output
+ from io import StringIO
+
+ buffer = StringIO()
+ console.file = buffer
+ console.print(table)
+ return buffer.getvalue()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c5dce0051d91981005d0a3243345fb0c374ef1a1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+# GPU Memory Calculator - Requirements for Hugging Face Spaces
+
+# Core dependencies
+pydantic>=2.0.0
+click>=8.1.0
+pydantic-settings>=2.0.0
+rich>=13.0.0
+
+# Web dependencies
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+jinja2>=3.1.0
diff --git a/src/gpu_mem_calculator.egg-info/PKG-INFO b/src/gpu_mem_calculator.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..f688d67e80ae6860491c9404be9970393de206cf
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/PKG-INFO
@@ -0,0 +1,720 @@
+Metadata-Version: 2.4
+Name: gpu-mem-calculator
+Version: 0.1.0
+Summary: GPU Memory Calculator for LLM Training
+Author: GPU Mem Calculator Team
+License: MIT
+Project-URL: Homepage, https://github.com/George614/gpu-mem-calculator
+Project-URL: Repository, https://github.com/George614/gpu-mem-calculator
+Project-URL: Issues, https://github.com/George614/gpu-mem-calculator/issues
+Keywords: gpu,memory,calculator,llm,large-language-model,training,deepspeed,megatron,pytorch,fsdp,transformer,machine-learning,deep-learning,distributed-training,zero-optimization
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pydantic>=2.0.0
+Requires-Dist: click>=8.1.0
+Requires-Dist: pydantic-settings>=2.0.0
+Requires-Dist: rich>=13.0.0
+Provides-Extra: web
+Requires-Dist: fastapi>=0.100.0; extra == "web"
+Requires-Dist: uvicorn[standard]>=0.23.0; extra == "web"
+Requires-Dist: jinja2>=3.1.0; extra == "web"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Requires-Dist: mypy>=1.5.0; extra == "dev"
+Dynamic: license-file
+
+# GPU Memory Calculator for LLM Training
+
+[](https://opensource.org/licenses/MIT)
+[](https://www.python.org/downloads/)
+[](https://github.com/psf/black)
+[](CONTRIBUTING.md)
+
+A versatile Python application for calculating GPU memory requirements for training Large Language Models with support for multiple training engines including PyTorch DDP, DeepSpeed ZeRO, Megatron-LM, and FSDP.
+
+๐ **[Getting Started Guide](docs/GETTING_STARTED.md)** | ๐ฌ **[FAQ](docs/FAQ.md)** | ๐ค **[Contributing](CONTRIBUTING.md)**
+
+
+
+
+
+## ๐ Why Use This Tool?
+
+Training large language models requires careful memory planning. This calculator helps you:
+
+- **๐ฐ Save costs** by determining the optimal GPU configuration before you start training
+- **โก Avoid OOM errors** by validating your training configuration fits in GPU memory
+- **๐ Compare strategies** across different training engines (DeepSpeed, Megatron, FSDP)
+- **๐ฏ Plan infrastructure** by knowing exactly how many GPUs you need
+- **๐ Scale efficiently** with detailed memory breakdowns for optimization
+
+Whether you're training a 7B parameter model on a single GPU or a 175B model across hundreds of GPUs, this tool provides accurate memory estimates based on proven formulas from DeepSpeed, Megatron-LM, and the latest research.
+
+## โจ Features
+
+### Core Training Calculation
+- ๐ง **Multiple Training Engines**: Support for PyTorch DDP, DeepSpeed ZeRO (stages 1-3), Megatron-LM, Megatron+DeepSpeed, and PyTorch FSDP
+- ๐ฅ๏ธ **Dual Interface**: Both CLI and Web UI for flexible usage
+- ๐ฏ **Preset Models**: Quick-load configurations for popular models (LLaMA 2, GPT-3, etc.)
+- ๐ **Detailed Breakdown**: Memory breakdown by component (parameters, gradients, optimizer states, activations)
+- ✅ **Feasibility Analysis**: Check if your configuration fits on available GPU memory
+- ⚙️ **Easy Config**: JSON-based configuration files with human-readable parameter formats (e.g., "7B", "7000M")
+
+### ๐ Inference Memory Calculation
+- ๐ **Multi-Engine Support**: HuggingFace Transformers, vLLM, TGI, TensorRT-LLM
+- ๐พ **KV Cache Optimization**: Quantization options (NONE, INT8, FP8, INT4)
+- ๐ **Tensor Parallelism**: Automatic memory distribution across GPUs
+- ๐ **Throughput Estimation**: Tokens/second estimates for capacity planning
+- ๐ฏ **Batch Size Optimization**: Find maximum batch size for your hardware
+
+### ๐ Multi-Node Training
+- ๐ **Network Overhead Calculation**: AllReduce, AllGather, ReduceScatter, pipeline communication
+- ๐ก **Interconnect Support**: InfiniBand, NVLink, Ethernet (10G/25G/100G/200G)
+- โก **Hybrid Parallelism Optimization**: Automatic TP+PP+DP strategy optimization
+- ๐ง **ZeRO Stage Impact Analysis**: Compare communication overhead across ZeRO stages
+
+### ๐ Framework Configuration Exporters
+- ๐ฆ **Accelerate Export**: HuggingFace Accelerate config generation
+- โก **Lightning Export**: PyTorch Lightning Trainer configuration
+- ๐ฅ **Axolotl Export**: YAML config for fine-tuning
+- ๐ **File Export**: Save to YAML/JSON formats
+- ๐๏ธ **Format Conversion**: Convert between different framework configs
+
+## ๐ฆ Installation
+
+### Quick Start
+
+### Core Capabilities
+- **Multiple Training Engines**: Support for PyTorch DDP, DeepSpeed ZeRO (stages 0-3), Megatron-LM, Megatron+DeepSpeed, and PyTorch FSDP
+- **Dual Interface**: Both CLI and Web UI for flexible usage
+- **Preset Models**: Quick-load configurations for popular models (LLaMA 2, GPT-3, GLM, Mixtral, etc.)
+- **Detailed Breakdown**: Memory breakdown by component (parameters, gradients, optimizer states, activations)
+- **Feasibility Analysis**: Check if your configuration fits on available GPU memory
+- **Easy Config**: JSON-based configuration files with human-readable parameter formats (e.g., "7B", "7000M")
+
+### Web UI Enhancements
+- **Formula Explanations**: See exactly how memory is calculated with your values plugged in
+- **Real-time Validation**: Client-side validation prevents invalid configurations
+- **Smart Auto-calculation**: Optimized debouncing (1s) with minimum interval protection
+- **Export Capabilities**: Export to DeepSpeed config files, JSON, or copy to clipboard
+- **Batch Size Optimizer**: Automatically find maximum batch size that fits
+- **Comparison Mode**: Save and compare different configurations side-by-side
+- **Accessibility Features**: ARIA labels, keyboard navigation, colorblind-friendly charts
+
+### Advanced Features
+- **MoE Support**: Mixture of Experts models with configurable experts and top-k routing
+- **CPU/NVMe Offloading**: Offload optimizer states and parameters to CPU or NVMe storage
+- **Activation Checkpointing**: 5 levels from none to full checkpointing
+- **Sequence Parallelism**: Optimize memory for long sequences
+- **Result Caching**: Fast repeated calculations with built-in caching
+
+```bash
+pip install git+https://github.com/George614/gpu-mem-calculator.git
+```
+
+### From source
+
+```bash
+git clone https://github.com/George614/gpu-mem-calculator.git
+cd gpu-mem-calculator
+pip install -e .
+```
+
+### For Web UI support
+
+```bash
+pip install -e ".[web]"
+```
+
+### Development installation
+
+```bash
+pip install -e ".[dev]"
+```
+
+## ๐ Use Cases
+
+### Research & Academia
+- Estimate GPU requirements for research projects before requesting compute resources
+- Plan multi-GPU training configurations for large-scale experiments
+- Compare memory efficiency of different training strategies
+
+### Industry & Production
+- Cost optimization: Choose the right GPU type and count for your training workload
+- Capacity planning: Forecast infrastructure needs for model development
+- Debugging: Diagnose OOM errors and optimize memory usage
+
+### Education & Learning
+- Understand how training configuration affects memory consumption
+- Learn about different distributed training strategies
+- Experiment with various optimization techniques safely
+
+## ๐ Usage
+
+### Command Line Interface
+
+#### Using model presets (Recommended)
+
+The calculator includes pre-configured model presets for popular LLMs:
+
+```bash
+# List all available presets
+gpu-mem-calc presets
+
+# Calculate with a preset
+gpu-mem-calc calculate --preset llama2-7b
+gpu-mem-calc calculate --preset mixtral-8x7b --format json
+
+# List presets in table format
+gpu-mem-calc presets --format table
+```
+
+Available presets include:
+- **Dense Models**: LLaMA 2 (7B, 13B, 70B), GPT-3 (175B)
+- **MoE Models**: Mixtral 8x7B, GLM-4 (9B), GLM-4.7 (355B), GLM-4.5 Air (106B),
+ Qwen1.5-MoE-A2.7B, DeepSeek-MoE (16B)
+
+#### Calculate from config file
+
+```bash
+gpu-mem-calc calculate --config configs/llama2_7b_deepspeed.json
+```
+
+#### Quick calculation from model size
+
+```bash
+# Calculate memory for 7B model with 8x80GB GPUs using DeepSpeed
+gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
+
+# With custom GPU memory
+gpu-mem-calc quick 70 --gpus 64 --gpu-mem 80 --engine megatron
+```
+
+#### Validate configuration
+
+```bash
+gpu-mem-calc validate configs/my_config.json
+```
+
+### Web Interface
+
+Start the web server:
+
+```bash
+python -m gpu_mem_calculator.web.app
+```
+
+Or using uvicorn directly:
+
+```bash
+uvicorn gpu_mem_calculator.web.app:app --reload
+```
+
+Then open your browser to `http://localhost:8000`
+
+### Python API
+
+#### Training Memory Calculation
+
+```python
+from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+from gpu_mem_calculator.core.models import (
+ ModelConfig,
+ TrainingConfig,
+ ParallelismConfig,
+ EngineConfig,
+ GPUConfig,
+)
+
+# Create configuration
+model_config = ModelConfig(
+ name="llama2-7b",
+ num_parameters=7_000_000_000,
+ num_layers=32,
+ hidden_size=4096,
+ num_attention_heads=32,
+ vocab_size=32000,
+ max_seq_len=4096,
+)
+
+training_config = TrainingConfig(
+ batch_size=4,
+ gradient_accumulation_steps=4,
+ dtype="bf16",
+ optimizer="adamw",
+)
+
+parallelism_config = ParallelismConfig(
+ data_parallel_size=8,
+)
+
+engine_config = EngineConfig(
+ type="deepspeed",
+ zero_stage=3,
+ offload_optimizer="cpu",
+)
+
+gpu_config = GPUConfig(
+ num_gpus=8,
+ gpu_memory_gb=80,
+)
+
+# Calculate memory
+calculator = GPUMemoryCalculator(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ engine_config=engine_config,
+ gpu_config=gpu_config,
+)
+
+result = calculator.calculate()
+
+print(f"Memory per GPU: {result.total_memory_per_gpu_gb:.2f} GB")
+print(f"Fits on GPU: {result.fits_on_gpu}")
+print(f"Utilization: {result.memory_utilization_percent:.1f}%")
+```
+
+#### ๐ Inference Memory Calculation
+
+```python
+from gpu_mem_calculator.inference.calculator import InferenceMemoryCalculator
+from gpu_mem_calculator.core.models import (
+ ModelConfig,
+ InferenceConfig,
+ InferenceEngineType,
+ GPUConfig,
+)
+
+# Create configurations
+model_config = ModelConfig(
+ name="llama2-7b",
+ num_parameters=7_000_000_000,
+ num_layers=32,
+ hidden_size=4096,
+ num_attention_heads=32,
+ max_seq_len=4096,
+)
+
+inference_config = InferenceConfig(
+ batch_size=32,
+ kv_cache_quantization="int8", # NONE, INT8, FP8, INT4
+ tensor_parallel_size=2,
+ gpu_memory_utilization=0.9,
+)
+
+gpu_config = GPUConfig(num_gpus=2, gpu_memory_gb=80)
+
+# Calculate for different inference engines
+calculator = InferenceMemoryCalculator(model_config, inference_config, gpu_config)
+
+# vLLM inference
+result_vllm = calculator.calculate(InferenceEngineType.VLLM)
+print(f"vLLM: {result_vllm.total_memory_per_gpu_gb:.2f} GB")
+print(f"Max batch size: {result_vllm.max_supported_batch_size}")
+print(f"Throughput: {result_vllm.estimated_throughput_tokens_per_sec:.0f} tokens/sec")
+
+# TensorRT-LLM inference
+result_trt = calculator.calculate(InferenceEngineType.TENSORRT_LLM)
+print(f"TensorRT-LLM: {result_trt.total_memory_per_gpu_gb:.2f} GB")
+```
+
+#### ๐ Multi-Node Network Overhead
+
+```python
+from gpu_mem_calculator.core.multinode import MultiNodeCalculator
+from gpu_mem_calculator.core.models import (
+ NodeConfig,
+ InterconnectType,
+)
+
+# Configure multi-node setup
+node_config = NodeConfig(
+ num_nodes=4,
+ gpus_per_node=8,
+ interconnect_type=InterconnectType.INFINIBAND,
+)
+
+calculator = MultiNodeCalculator(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ node_config=node_config,
+ engine_config=engine_config,
+)
+
+# Calculate network overhead
+network_overhead = calculator.calculate_network_overhead()
+print(f"AllReduce: {network_overhead.allreduce_gb:.2f} GB")
+print(f"AllGather: {network_overhead.allgather_gb:.2f} GB")
+print(f"Time overhead: {network_overhead.estimated_overhead_ms_per_step:.2f} ms/step")
+
+# Optimize hybrid parallelism
+from gpu_mem_calculator.core.models import HybridParallelismConfig
+
+hybrid_config = HybridParallelismConfig(
+ auto_optimize=True,
+ prefer_pipeline_parallel=True,
+ enable_sequence_parallel=True,
+)
+
+optimized_parallelism = calculator.optimize_hybrid_parallelism(hybrid_config)
+print(f"Optimized TP: {optimized_parallelism.tensor_parallel_size}")
+print(f"Optimized PP: {optimized_parallelism.pipeline_parallel_size}")
+print(f"Optimized DP: {optimized_parallelism.data_parallel_size}")
+```
+
+#### ๐ Export Framework Configurations
+
+```python
+from gpu_mem_calculator.exporters.manager import ExportManager, ExportFormat
+
+# Create export manager
+manager = ExportManager(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ engine_config=engine_config,
+ node_config=node_config,
+)
+
+# Export to different formats
+accelerate_config = manager.export(ExportFormat.ACCELERATE)
+lightning_config = manager.export(ExportFormat.LIGHTNING)
+axolotl_config = manager.export(ExportFormat.AXOLOTL)
+
+# Export to file
+manager.export_to_file(ExportFormat.ACCELERATE, "accelerate_config.yaml")
+manager.export_to_file(ExportFormat.JSON, "config.json")
+
+# Get DeepSpeed config
+deepspeed_config = manager.export(ExportFormat.DEEPSPEED)
+```
+
+## Configuration File Format
+
+```json
+{
+ "model": {
+ "name": "llama2-7b",
+ "num_parameters": "7B",
+ "num_layers": 32,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "vocab_size": 32000,
+ "max_seq_len": 4096
+ },
+ "training": {
+ "batch_size": 4,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 1
+ },
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+}
+```
+
+## Supported Training Engines
+
+### PyTorch DDP (Baseline)
+Standard Distributed Data Parallel training without memory optimizations.
+
+### DeepSpeed ZeRO
+- **ZeRO-1**: Shard optimizer states
+- **ZeRO-2**: Shard optimizer states + gradients
+- **ZeRO-3**: Shard everything (parameters, gradients, optimizer states)
+- Supports CPU/NVMe offloading
+
+### Megatron-LM
+Tensor and pipeline parallelism with activation checkpointing support.
+
+### Megatron + DeepSpeed
+Combines Megatron-LM's model parallelism with DeepSpeed ZeRO's optimizer sharding.
+
+### PyTorch FSDP
+Fully Sharded Data Parallel with multiple sharding strategies.
+
+## Memory Formulas
+
+The calculator uses formulas verified against authoritative sources:
+
+### Base Components
+
+**Model Parameters:**
+- FP16/BF16: `num_params × 2 bytes`
+- FP32: `num_params × 4 bytes`
+
+**Gradients:**
+- FP16/BF16: `num_params × 2 bytes`
+- FP32: `num_params × 4 bytes`
+
+**Optimizer States** (per optimizer type):
+- **Adam/AdamW**: `num_params × 12 bytes`
+  - 4 bytes: FP32 parameter copy
+  - 4 bytes: Momentum
+  - 4 bytes: Variance
+- **AdamW 8-bit**: `num_params × 2 bytes` (quantized)
+- **SGD**: `num_params × 4 bytes` (FP32 only, no momentum)
+
+**Activations:**
+- Approximation: `batch_size × seq_len × hidden_size × num_layers × ~16 bytes/token/layer`
+- Varies based on activation checkpointing level
+
+### DeepSpeed ZeRO Stages
+
+**ZeRO-0** (Baseline - same as PyTorch DDP):
+```
+total_per_gpu = 2×params + 2×params + 12×params + activations
+ = 16×params + activations
+```
+
+**ZeRO-1** (Shard optimizer states):
+```
+total_per_gpu = 2×params + 2×params + (12×params)/num_gpus + activations
+```
+
+**ZeRO-2** (Shard optimizer + gradients):
+```
+total_per_gpu = 2×params + (2×params)/num_gpus + (12×params)/num_gpus + activations
+```
+
+**ZeRO-3** (Shard everything):
+```
+total_per_gpu = largest_layer_memory + (16×params)/num_gpus + activations
+where largest_layer_memory ≈ 4×(num_params/10)
+```
+
+**CPU/NVMe Offloading:**
+- Optimizer states offloaded to CPU: 0 GB GPU memory
+- Parameters offloaded to CPU/NVMe: Dynamically gathered during compute
+
+### Verification
+
+All formulas have been verified against:
+- ✅ 18 comprehensive test scenarios (100% pass rate)
+- ✅ EleutherAI Transformer Math 101
+- ✅ Microsoft Research ZeRO Blog
+- ✅ DeepSpeed Official Documentation
+- ✅ PyTorch FSDP Documentation
+
+### References
+
+- [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) - Comprehensive transformer memory breakdown
+- [Microsoft Research ZeRO Blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) - ZeRO optimization techniques
+- [DeepSpeed Memory Documentation](https://deepspeed.readthedocs.io/en/latest/memory.html) - Official DeepSpeed memory formulas
+
+## Example Configurations
+
+### LLaMA 2 7B with DeepSpeed ZeRO-3
+```bash
+gpu-mem-calc calculate --config configs/llama2_7b_deepspeed.json
+```
+
+### GPT-3 175B with Megatron-LM
+```bash
+gpu-mem-calc calculate --config configs/gpt3_175b_megatron.json
+```
+
+### Custom 1B model with PyTorch DDP
+```bash
+gpu-mem-calc calculate --config configs/pytorch_ddp_example.json
+```
+
+## Web UI Features
+
+### Interactive Interface
+- **Real-time Calculations**: Auto-calculates as you adjust parameters (1s debounce)
+- **Client-side Validation**: Instant feedback on configuration errors before API calls
+- **Smart Presets**: Quick-load model configurations (LLaMA 2, GPT-3, GLM, Mixtral, Qwen, DeepSeek)
+- **Visual Breakdown**: Color-coded bar chart with patterns for colorblind accessibility
+- **Feasibility Status**: Clear indicators showing if configuration fits on GPU
+
+### Formula Explanations
+- **Detailed Breakdowns**: See exact formulas used with your values plugged in
+- **Component-by-Component**: Each memory component explained with formula and result
+- **Authoritative References**: Links to EleutherAI, Microsoft Research, DeepSpeed docs
+- **Engine-Specific Details**: Different formulas for PyTorch DDP, DeepSpeed ZeRO, FSDP, Megatron-LM
+
+### Advanced Tools
+- **Export to DeepSpeed**: Generate `deepspeed_config.json` files automatically
+- **Batch Size Optimizer**: Find maximum batch size that fits your GPU memory
+- **Config Persistence**: Save configurations to browser localStorage
+- **Comparison Mode**: Compare different configurations side-by-side
+
+### Accessibility
+- **ARIA Labels**: Full screen reader support throughout the interface
+- **Keyboard Navigation**: All features accessible via keyboard
+- **Colorblind-Friendly**: Patterns and textures supplement colors in charts
+- **High Contrast**: Clear visual indicators with multiple cues
+
+### API Endpoints
+- `POST /api/calculate` - Calculate GPU memory requirements
+- `POST /api/explain-formula` - Get detailed formula explanation
+- `POST /api/export/deepspeed` - Export DeepSpeed config file
+- `POST /api/optimize/batch-size` - Find maximum batch size
+- `GET /api/preset/{preset_name}` - Load model preset
+
+## Development
+
+### Running Tests
+
+```bash
+pytest tests/
+```
+
+### Test Coverage
+
+The calculator includes comprehensive testing:
+- **Unit Tests**: Core calculation logic for each engine type
+- **Integration Tests**: End-to-end configuration validation
+- **Formula Verification**: 18 scenarios verifying formula accuracy
+- **API Tests**: Web API endpoint testing
+- **Accessibility Tests**: Screen reader and keyboard navigation
+
+All formulas verified accurate against authoritative sources with 100% test pass rate.
+
+### Code Formatting
+
+```bash
+black src/ cli/ web/
+ruff check src/ cli/ web/
+```
+
+### Type Checking
+
+```bash
+mypy src/
+```
+
+## Recent Improvements
+
+### Latest Updates
+- ✨ Added formula explanation feature with detailed breakdowns
+- ✨ Added client-side validation for better UX
+- ✨ Added batch size optimizer API
+- ✨ Added DeepSpeed config export functionality
+- ✨ Added comprehensive input validation
+- ✨ Added result caching for performance
+- ♿ Added ARIA labels for full accessibility
+- ♿ Added colorblind patterns to charts
+- 🐛 Fixed optimizer formulas to be optimizer-specific
+- 🐛 Fixed Pydantic namespace warnings
+
+### Verification Status
+- ✅ All 18 test scenarios passing (100%)
+- ✅ Formulas verified against EleutherAI, Microsoft Research, DeepSpeed docs
+- ✅ Optimizer formulas corrected for AdamW, AdamW 8-bit, and SGD
+- ✅ ZeRO stage formulas validated (0, 1, 2, 3)
+- ✅ Engine type formulas validated (PyTorch DDP, DeepSpeed, FSDP, Megatron-LM)
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request. See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
+
+## ๐ References
+
+The memory calculations in this tool are based on authoritative sources:
+
+**Core Memory Formulas:**
+- [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) - Comprehensive breakdown of transformer memory requirements
+- [Microsoft Research ZeRO Blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) - ZeRO optimization techniques
+- [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/abs/2204.13323) - Activation checkpointing strategies
+
+**Engine Documentation:**
+- [DeepSpeed Memory Documentation](https://deepspeed.readthedocs.io/en/latest/memory.html) - Official DeepSpeed memory formulas
+- [NVIDIA Megatron-LM](https://github.com/NVIDIA/Megatron-LM) - Tensor and pipeline parallelism
+- [PyTorch FSDP Documentation](https://pytorch.org/docs/stable/fsdp.html) - Fully sharded data parallel
+- [PyTorch DDP Tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) - Distributed data parallel
+
+**Related Tools:**
+- [llm-analysis](https://github.com/cli99/llm-analysis) - LLM memory analysis
+- [vram-calculator](https://github.com/furiousteabag/vram-calculator) - VRAM calculation utilities
+
+## ๐ค Community & Support
+
+- ๐ [Documentation](README.md)
+- ๐ [Issue Tracker](https://github.com/George614/gpu-mem-calculator/issues)
+- ๐ฌ [Discussions](https://github.com/George614/gpu-mem-calculator/discussions)
+- ๐ง Contact the maintainers via GitHub
+
+### Star History
+
+If you find this tool useful, please consider giving it a star! ⭐
+
+## ๐ Roadmap
+
+- [x] Inference memory calculation
+- [x] Multi-node training configurations
+- [x] Export to training framework configs (Accelerate, Lightning, Axolotl)
+- [ ] PyPI package distribution
+- [ ] Support for more model architectures (Vision Transformers, Diffusion models)
+- [ ] Real-time memory monitoring dashboard
+- [ ] CLI commands for inference and export features
+
+## ๐ Acknowledgments
+
+This tool was inspired by and builds upon the excellent work of:
+- [DeepSpeed Memory Estimator](https://deepspeed.readthedocs.io/en/latest/memory.html) - ZeRO memory optimization formulas
+- [llm-analysis](https://github.com/cli99/llm-analysis) - LLM memory analysis methodology
+- [vram-calculator](https://github.com/furiousteabag/vram-calculator) - VRAM calculation approach
+
+Special thanks to the EleutherAI community for their comprehensive [Transformer Math 101](https://blog.eleuther.ai/transformer-math/) guide, which provides detailed formulas for transformer memory calculations.
+
+## ๐ License
+
+MIT License - see [LICENSE](LICENSE) for details.
+
+## ๐ Citation
+
+If you use this tool in your research, please cite:
+
+```bibtex
+@software{gpu_mem_calculator,
+ title = {GPU Memory Calculator for LLM Training},
+ author = {GPU Mem Calculator Team},
+ year = {2024},
+ url = {https://github.com/George614/gpu-mem-calculator}
+}
+```
+
+---
+
+
+ Made with ❤️ for the ML community
+
+
+
+ ⭐ Star us on GitHub •
+ 🐛 Report a Bug •
+ 💡 Request a Feature
+
+
diff --git a/src/gpu_mem_calculator.egg-info/SOURCES.txt b/src/gpu_mem_calculator.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3ab30d6f4c4ddbbe18d80cc890b1c66dc873abfc
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/SOURCES.txt
@@ -0,0 +1,46 @@
+LICENSE
+README.md
+pyproject.toml
+src/gpu_mem_calculator/__init__.py
+src/gpu_mem_calculator/py.typed
+src/gpu_mem_calculator.egg-info/PKG-INFO
+src/gpu_mem_calculator.egg-info/SOURCES.txt
+src/gpu_mem_calculator.egg-info/dependency_links.txt
+src/gpu_mem_calculator.egg-info/entry_points.txt
+src/gpu_mem_calculator.egg-info/requires.txt
+src/gpu_mem_calculator.egg-info/top_level.txt
+src/gpu_mem_calculator/cli/__init__.py
+src/gpu_mem_calculator/cli/main.py
+src/gpu_mem_calculator/config/__init__.py
+src/gpu_mem_calculator/config/parser.py
+src/gpu_mem_calculator/config/presets.py
+src/gpu_mem_calculator/core/__init__.py
+src/gpu_mem_calculator/core/calculator.py
+src/gpu_mem_calculator/core/formulas.py
+src/gpu_mem_calculator/core/models.py
+src/gpu_mem_calculator/core/multinode.py
+src/gpu_mem_calculator/engines/__init__.py
+src/gpu_mem_calculator/engines/base.py
+src/gpu_mem_calculator/engines/deepspeed.py
+src/gpu_mem_calculator/engines/fsdp.py
+src/gpu_mem_calculator/engines/megatron.py
+src/gpu_mem_calculator/engines/pytorch.py
+src/gpu_mem_calculator/exporters/__init__.py
+src/gpu_mem_calculator/exporters/accelerate.py
+src/gpu_mem_calculator/exporters/axolotl.py
+src/gpu_mem_calculator/exporters/lightning.py
+src/gpu_mem_calculator/exporters/manager.py
+src/gpu_mem_calculator/inference/__init__.py
+src/gpu_mem_calculator/inference/base.py
+src/gpu_mem_calculator/inference/calculator.py
+src/gpu_mem_calculator/inference/huggingface.py
+src/gpu_mem_calculator/inference/tensorrt_llm.py
+src/gpu_mem_calculator/inference/tgi.py
+src/gpu_mem_calculator/inference/vllm.py
+src/gpu_mem_calculator/utils/__init__.py
+src/gpu_mem_calculator/utils/precision.py
+tests/test_calculator.py
+tests/test_comprehensive.py
+tests/test_exporters.py
+tests/test_inference.py
+tests/test_multinode.py
\ No newline at end of file
diff --git a/src/gpu_mem_calculator.egg-info/dependency_links.txt b/src/gpu_mem_calculator.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/src/gpu_mem_calculator.egg-info/entry_points.txt b/src/gpu_mem_calculator.egg-info/entry_points.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e1fd80a3a8d68ab65662f055ef4d082e9c1c1bf1
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+gpu-mem-calc = gpu_mem_calculator.cli:main
diff --git a/src/gpu_mem_calculator.egg-info/requires.txt b/src/gpu_mem_calculator.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d241d2a18141a662cbb9f832803514e123830104
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/requires.txt
@@ -0,0 +1,16 @@
+pydantic>=2.0.0
+click>=8.1.0
+pydantic-settings>=2.0.0
+rich>=13.0.0
+
+[dev]
+pytest>=7.0.0
+pytest-cov>=4.0.0
+black>=23.0.0
+ruff>=0.1.0
+mypy>=1.5.0
+
+[web]
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+jinja2>=3.1.0
diff --git a/src/gpu_mem_calculator.egg-info/top_level.txt b/src/gpu_mem_calculator.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b26471d7f90301fe8c843f68393d1f9a6065626b
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/top_level.txt
@@ -0,0 +1 @@
+gpu_mem_calculator
diff --git a/src/gpu_mem_calculator/__init__.py b/src/gpu_mem_calculator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5c38d92c579824dcc8f2cb8e2e90e74505c2cb
--- /dev/null
+++ b/src/gpu_mem_calculator/__init__.py
@@ -0,0 +1,3 @@
+"""GPU Memory Calculator for LLM Training."""
+
+__version__ = "0.1.0"
diff --git a/src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0672fbc3b0e1a5956876bbb7c37c70870f0285d3
Binary files /dev/null and b/src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/cli/__init__.py b/src/gpu_mem_calculator/cli/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ba3b37df278c165fe3f50a4e6ee520b4217638
--- /dev/null
+++ b/src/gpu_mem_calculator/cli/__init__.py
@@ -0,0 +1,5 @@
+"""CLI interface for GPU Memory Calculator."""
+
+from gpu_mem_calculator.cli.main import main
+
+__all__ = ["main"]
diff --git a/src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..180200bcd8eef434ddcee44e1b5db827548cf013
Binary files /dev/null and b/src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc b/src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ad0e5dff8e68e55f2ea1d97f3114f1e07ec57ac
Binary files /dev/null and b/src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/cli/main.py b/src/gpu_mem_calculator/cli/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd35d265a2e992d473bdafdff54fa5c1fba7b5c9
--- /dev/null
+++ b/src/gpu_mem_calculator/cli/main.py
@@ -0,0 +1,399 @@
+"""CLI interface for GPU Memory Calculator."""
+
+import json
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+import click
+
+if TYPE_CHECKING:
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+ from gpu_mem_calculator.core.models import MemoryResult
+
+
@click.group()
@click.version_option(version="0.1.0")
def main() -> None:
    """GPU Memory Calculator for LLM Training.

    Calculate GPU memory requirements for training Large Language Models
    with various training engines (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP).
    """
    # Click group entry point: subcommands (calculate, quick, validate,
    # presets) attach themselves via @main.command() decorators.
    pass
+
+
@main.command()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True),
    help="Path to JSON configuration file",
)
@click.option(
    "--preset",
    "-p",
    type=str,
    help="Name of a preset model configuration",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output file path (default: stdout)",
)
@click.option(
    "--format",
    "-f",
    type=click.Choice(["json", "yaml", "table"]),
    default="table",
    help="Output format (default: table)",
)
def calculate(
    config: str | None,
    preset: str | None,
    output: str | None,
    format: Literal["json", "yaml", "table"],
) -> None:
    """Calculate GPU memory requirements from config file or preset.

    Exactly one of --config / --preset must be given. Results are printed
    to stdout (or written to --output) in the requested format.

    Examples:
        gpu-mem-calc calculate --config configs/llama2_7b.json
        gpu-mem-calc calculate --preset llama2-7b
        gpu-mem-calc calculate -p mixtral-8x7b --format json
    """
    if not config and not preset:
        click.echo("Error: Either --config or --preset is required", err=True)
        sys.exit(1)

    if config and preset:
        click.echo("Error: Cannot use both --config and --preset", err=True)
        sys.exit(1)

    try:
        import tempfile

        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator

        if preset:
            # Load preset configuration
            from gpu_mem_calculator.config.presets import get_preset_config

            preset_config = get_preset_config(preset)
            if preset_config is None:
                click.echo(
                    f"Error: Preset '{preset}' not found. "
                    "Use 'gpu-mem-calc presets' to list available presets.",
                    err=True,
                )
                sys.exit(1)

            # Write the preset to a temp file so we can reuse from_config_file.
            # delete=False is needed because the parser reopens the file by
            # path; clean up in a finally block so the temp file is not leaked
            # when parsing raises.
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
                json.dump(preset_config, f, indent=2)
                temp_path = f.name

            try:
                calculator = GPUMemoryCalculator.from_config_file(temp_path)
            finally:
                Path(temp_path).unlink(missing_ok=True)
        elif config:
            calculator = GPUMemoryCalculator.from_config_file(config)
        else:
            # Unreachable: guarded above. Kept for type-checkers.
            click.echo("Error: Either --config or --preset is required", err=True)
            sys.exit(1)

        result = calculator.calculate()

        # Format output
        if format == "json":
            output_text = json.dumps(result.model_dump(mode="json"), indent=2)
        elif format == "yaml":
            try:
                import yaml  # type: ignore[import-untyped]

                output_text = yaml.dump(result.model_dump(mode="json"), default_flow_style=False)
            except ImportError:
                click.echo(
                    "Error: YAML format requires PyYAML. Install with: pip install pyyaml",
                    err=True,
                )
                sys.exit(1)
        else:  # table
            output_text = _format_result_as_table(result, calculator)

        # Write output
        if output:
            Path(output).write_text(output_text)
            click.echo(f"Results written to {output}")
        else:
            click.echo(output_text)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
+
+
@main.command()
@click.argument(
    "params",
    type=float,
    required=True,
)
@click.option(
    "--gpus",
    "-g",
    type=int,
    default=1,
    help="Number of GPUs (default: 1)",
)
@click.option(
    "--gpu-mem",
    "-m",
    type=float,
    default=80.0,
    help="GPU memory in GB (default: 80.0)",
)
@click.option(
    "--engine",
    "-e",
    type=click.Choice(["pytorch", "deepspeed", "megatron", "fsdp"]),
    default="pytorch",
    help="Training engine (default: pytorch)",
)
@click.option(
    "--dtype",
    "-d",
    type=click.Choice(["fp32", "fp16", "bf16"]),
    default="bf16",
    help="Data type (default: bf16)",
)
def quick(
    params: float,
    gpus: int,
    gpu_mem: float,
    engine: str,
    dtype: str,
) -> None:
    """Quick calculation from model size (in billions of parameters).

    Example:
        gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
    """
    try:
        # Imports are deferred into the command body; any failure is reported
        # through the generic error handler below.
        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
        from gpu_mem_calculator.core.models import (
            DType,
            EngineConfig,
            EngineType,
            GPUConfig,
            ModelConfig,
            ParallelismConfig,
            TrainingConfig,
        )

        # Map engine string to EngineType
        engine_map = {
            "pytorch": EngineType.PYTORCH_DDP,
            "deepspeed": EngineType.DEEPSPEED,
            "megatron": EngineType.MEGATRON_LM,
            "fsdp": EngineType.FSDP,
        }

        # Map dtype string to DType
        dtype_map = {
            "fp32": DType.FP32,
            "fp16": DType.FP16,
            "bf16": DType.BF16,
        }

        # Create a minimal config for quick calculation.
        # `params` is given in billions on the command line.
        num_params = int(params * 1e9)

        # Estimate hidden size and layer count from the parameter count.
        # These are rough heuristics; the tiers appear to mirror common
        # LLaMA-family sizes (7B/13B/30B/65B) and GPT-3-scale models —
        # NOTE(review): ballpark only, activation estimates depend on them.
        if params <= 1:
            hidden_size, num_layers = 768, 12
        elif params <= 7:
            hidden_size, num_layers = 4096, 32
        elif params <= 13:
            hidden_size, num_layers = 5120, 40
        elif params <= 30:
            hidden_size, num_layers = 6656, 60
        elif params <= 65:
            hidden_size, num_layers = 8192, 80
        else:
            hidden_size, num_layers = 12288, 96

        model_config = ModelConfig(
            name="quick-estimate",
            num_parameters=num_params,
            num_layers=num_layers,
            # Assumes a head dimension of 128 — TODO confirm against models.
            num_attention_heads=hidden_size // 128,
            hidden_size=hidden_size,
            vocab_size=32000,
            max_seq_len=2048,
        )

        # Minimal training setup: batch size 1, no gradient accumulation.
        training_config = TrainingConfig(
            batch_size=1,
            gradient_accumulation_steps=1,
            dtype=dtype_map[dtype],
        )

        # Pure data parallelism across all requested GPUs.
        parallelism_config = ParallelismConfig(data_parallel_size=gpus)

        # DeepSpeed defaults to ZeRO stage 2; other engines take no stage.
        engine_config = EngineConfig(
            type=engine_map[engine],
            zero_stage=2 if engine == "deepspeed" else None,
        )

        gpu_config = GPUConfig(num_gpus=gpus, gpu_memory_gb=gpu_mem)

        calculator = GPUMemoryCalculator(
            model_config=model_config,
            training_config=training_config,
            parallelism_config=parallelism_config,
            engine_config=engine_config,
            gpu_config=gpu_config,
        )

        result = calculator.calculate()

        # Display results
        click.echo(_format_result_as_table(result, calculator))

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
+
+
@main.command()
@click.argument(
    "config_path",
    type=click.Path(exists=True),
)
def validate(config_path: str) -> None:
    """Validate a configuration file.

    Parses every section of the file; any ConfigParseError (or other
    failure) is reported and the process exits non-zero.

    Example:
        gpu-mem-calc validate configs/my_config.json
    """
    try:
        from gpu_mem_calculator.config import ConfigParser

        ConfigParser.parse_full_config(config_path)
        # Distinct glyphs for success/failure (previously both printed the
        # same mojibake character, making outcomes indistinguishable).
        click.echo(f"✓ Configuration file '{config_path}' is valid")

    except Exception as e:
        click.echo(f"✗ Validation failed: {e}", err=True)
        sys.exit(1)
+
+
@main.command()
@click.option(
    "--format",
    "-f",
    type=click.Choice(["list", "json", "table"]),
    default="list",
    help="Output format (default: list)",
)
def presets(format: str) -> None:
    """List available model preset configurations.

    Examples:
        gpu-mem-calc presets
        gpu-mem-calc presets --format table
        gpu-mem-calc presets -f json
    """
    try:
        from gpu_mem_calculator.config.presets import list_presets

        catalog = list_presets()
        if not catalog:
            click.echo("No presets found.")
            return

        if format == "json":
            click.echo(json.dumps(catalog, indent=2))
        elif format == "table":
            from rich.console import Console
            from rich.table import Table

            console = Console()
            preset_table = Table(
                title="Available Model Presets",
                show_header=True,
                header_style="bold magenta",
            )
            preset_table.add_column("Preset Name", style="cyan", width=25)
            preset_table.add_column("Display Name", style="green", width=30)
            preset_table.add_column("Description", style="yellow")

            for preset_name, meta in sorted(catalog.items()):
                preset_table.add_row(preset_name, meta["display_name"], meta["description"])

            console.print(preset_table)
        else:
            # Plain indented list (the default format).
            click.echo("Available model presets:\n")
            for preset_name, meta in sorted(catalog.items()):
                click.echo(f" {preset_name:25} - {meta['display_name']}")
                if meta.get("description"):
                    click.echo(f"{'':27}{meta['description']}")
                click.echo()

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
+
+
def _format_result_as_table(result: "MemoryResult", calculator: "GPUMemoryCalculator") -> str:
    """Render a MemoryResult as an ASCII table string via rich.

    Args:
        result: The calculation result to display.
        calculator: The calculator that produced the result (currently unused
            in the rendering itself).

    Returns:
        The rendered table as plain text.
    """
    # NOTE: "MemoryResult" must stay a string annotation — it is imported only
    # under TYPE_CHECKING, so an unquoted annotation raises NameError at
    # import time (the module has no `from __future__ import annotations`).
    from io import StringIO

    from rich.console import Console
    from rich.table import Table

    # Main results table
    table = Table(
        title="GPU Memory Calculation Results",
        show_header=True,
        header_style="bold magenta",
    )
    table.add_column("Metric", style="cyan", width=30)
    table.add_column("Value", style="green")

    # Memory results
    table.add_row("Memory per GPU", f"{result.total_memory_per_gpu_gb:.2f} GB")
    table.add_row("Total GPU Memory", f"{result.total_memory_all_gpus_gb:.2f} GB")
    table.add_row("CPU Memory", f"{result.cpu_memory_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Breakdown
    table.add_row("Model Parameters", f"{result.breakdown.model_params_gb:.2f} GB")
    table.add_row("Gradients", f"{result.breakdown.gradients_gb:.2f} GB")
    table.add_row("Optimizer States", f"{result.breakdown.optimizer_states_gb:.2f} GB")
    table.add_row("Activations", f"{result.breakdown.activations_gb:.2f} GB")
    table.add_row("Overhead", f"{result.breakdown.overhead_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Feasibility (previously both states printed the same mojibake glyph)
    status = "✓ Fits" if result.fits_on_gpu else "✗ OOM"
    table.add_row("Status", status)
    table.add_row("Memory Utilization", f"{result.memory_utilization_percent:.1f}%")
    if result.recommended_batch_size:
        table.add_row("Recommended Batch Size", str(result.recommended_batch_size))

    # Render into an in-memory buffer by constructing the Console with
    # file=buffer, rather than reassigning console.file after construction.
    buffer = StringIO()
    console = Console(file=buffer)
    console.print(table)
    return buffer.getvalue()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/gpu_mem_calculator/config/__init__.py b/src/gpu_mem_calculator/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5fe79109caffb06ba7b153482f1555ef0247cd5
--- /dev/null
+++ b/src/gpu_mem_calculator/config/__init__.py
@@ -0,0 +1,5 @@
+"""Configuration parsing and defaults."""
+
+from gpu_mem_calculator.config.parser import ConfigParser, load_config, save_config
+
+__all__ = ["ConfigParser", "load_config", "save_config"]
diff --git a/src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6845d43ad5c81d022d051de440eea3d3d471b4d
Binary files /dev/null and b/src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc b/src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b793d40e6d97a328acf7af3d275d75075cddefa
Binary files /dev/null and b/src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc b/src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6101f11f5352897241db4893e9b64fd071dfc8f1
Binary files /dev/null and b/src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/config/parser.py b/src/gpu_mem_calculator/config/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..182b19d437f6ab1befaa129afbec5cd90d6f5f71
--- /dev/null
+++ b/src/gpu_mem_calculator/config/parser.py
@@ -0,0 +1,323 @@
+"""Configuration file parser and utilities."""
+
+import json
+from pathlib import Path
+from typing import Any, cast
+
+from pydantic import ValidationError
+
+from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ GPUConfig,
+ ModelConfig,
+ OffloadDevice,
+ OptimizerType,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
+class ConfigParseError(Exception):
+ """Error parsing configuration file."""
+
+ def __init__(self, message: str, errors: list[Any] | None = None):
+ super().__init__(message)
+ self.errors = errors or []
+
+
class ConfigParser:
    """Parse and validate configuration files.

    String fields (dtype, optimizer, engine, offload device) are normalized
    into the project enums; unknown strings silently fall back to the
    project defaults (BF16 / AdamW / PyTorch DDP / no offload).
    """

    @staticmethod
    def _convert_dtype(value: str) -> DType:
        """Convert string dtype to DType enum (unknown values -> BF16)."""
        dtype_map = {
            "float32": DType.FP32,
            "fp32": DType.FP32,
            "float16": DType.FP16,
            "fp16": DType.FP16,
            "bfloat16": DType.BF16,
            "bf16": DType.BF16,
            "int8": DType.INT8,
            "int4": DType.INT4,
        }
        return dtype_map.get(value.lower(), DType.BF16)

    @staticmethod
    def _convert_optimizer(value: str) -> OptimizerType:
        """Convert string optimizer to OptimizerType enum (unknown -> AdamW)."""
        opt_map = {
            "adam": OptimizerType.ADAM,
            "adamw": OptimizerType.ADAMW,
            "sgd": OptimizerType.SGD,
            "adamw_8bit": OptimizerType.ADAMW_8BIT,
            "adamw-8bit": OptimizerType.ADAMW_8BIT,
        }
        return opt_map.get(value.lower(), OptimizerType.ADAMW)

    @staticmethod
    def _convert_engine(value: str) -> EngineType:
        """Convert string engine to EngineType enum (unknown -> PyTorch DDP)."""
        engine_map = {
            "pytorch": EngineType.PYTORCH_DDP,
            "pytorch_ddp": EngineType.PYTORCH_DDP,
            "ddp": EngineType.PYTORCH_DDP,
            "deepspeed": EngineType.DEEPSPEED,
            "megatron": EngineType.MEGATRON_LM,
            "megatron_lm": EngineType.MEGATRON_LM,
            "megatron-lm": EngineType.MEGATRON_LM,
            "fsdp": EngineType.FSDP,
            "megatron_deepspeed": EngineType.MEGATRON_DEEPSPEED,
        }
        return engine_map.get(value.lower(), EngineType.PYTORCH_DDP)

    @staticmethod
    def _convert_offload(value: str) -> OffloadDevice:
        """Convert string offload to OffloadDevice enum (unknown -> NONE)."""
        offload_map = {
            "none": OffloadDevice.NONE,
            "cpu": OffloadDevice.CPU,
            "nvme": OffloadDevice.NVME,
        }
        return offload_map.get(value.lower(), OffloadDevice.NONE)

    @staticmethod
    def _parse_num_params(value: str | int | float) -> int:
        """Parse number of parameters from various formats.

        Supports:
        - Raw integer/float: 7000000000, 7e9
        - Trillions: "1.5T"
        - Billions: "7B", "7b", "7e9"
        - Millions: "7000M", "7000m", "7000e6"
        - Thousands: "500K"
        - Digit separators: "7,000,000" or "7_000_000"
        """
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            return int(value)

        if isinstance(value, str):
            # Normalize: trim whitespace, uppercase, drop digit separators.
            value = value.strip().upper().replace(",", "").replace("_", "")

            # Magnitude suffixes, checked before scientific notation since
            # none of them collide with the "E" marker.
            suffixes = {
                "T": 1_000_000_000_000,
                "B": 1_000_000_000,
                "M": 1_000_000,
                "K": 1_000,
            }
            for suffix, multiplier in suffixes.items():
                if value.endswith(suffix):
                    return int(float(value[:-1]) * multiplier)

            # Handle scientific notation, e.g. "7E9"
            if "E" in value:
                return int(float(value))

            # Try direct conversion
            return int(value)

        raise ValueError(f"Cannot parse parameter count: {value}")

    @classmethod
    def parse_model_config(cls, data: dict[str, Any]) -> ModelConfig:
        """Parse model configuration from dict.

        Args:
            data: Dictionary with model configuration

        Returns:
            ModelConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            # Convert parameter counts given as strings ("7B", "500M", ...)
            if "num_parameters" in data and isinstance(data["num_parameters"], str):
                data["num_parameters"] = cls._parse_num_params(data["num_parameters"])

            if "largest_layer_params" in data and isinstance(data["largest_layer_params"], str):
                data["largest_layer_params"] = cls._parse_num_params(data["largest_layer_params"])

            return ModelConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid model configuration", e.errors()) from e

    @classmethod
    def parse_training_config(cls, data: dict[str, Any]) -> TrainingConfig:
        """Parse training configuration from dict.

        Args:
            data: Dictionary with training configuration

        Returns:
            TrainingConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            # Convert dtype
            if "dtype" in data and isinstance(data["dtype"], str):
                data["dtype"] = cls._convert_dtype(data["dtype"])

            # Convert optimizer
            if "optimizer" in data and isinstance(data["optimizer"], str):
                data["optimizer"] = cls._convert_optimizer(data["optimizer"])

            return TrainingConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid training configuration", e.errors()) from e

    @classmethod
    def parse_parallelism_config(cls, data: dict[str, Any]) -> ParallelismConfig:
        """Parse parallelism configuration from dict.

        Args:
            data: Dictionary with parallelism configuration

        Returns:
            ParallelismConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            return ParallelismConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid parallelism configuration", e.errors()) from e

    @classmethod
    def parse_engine_config(cls, data: dict[str, Any]) -> EngineConfig:
        """Parse engine configuration from dict.

        Args:
            data: Dictionary with engine configuration

        Returns:
            EngineConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            # Convert engine type
            if "type" in data and isinstance(data["type"], str):
                data["type"] = cls._convert_engine(data["type"])

            # Convert offload options
            if "offload_optimizer" in data and isinstance(data["offload_optimizer"], str):
                data["offload_optimizer"] = cls._convert_offload(data["offload_optimizer"])

            if "offload_param" in data and isinstance(data["offload_param"], str):
                data["offload_param"] = cls._convert_offload(data["offload_param"])

            return EngineConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid engine configuration", e.errors()) from e

    @classmethod
    def parse_gpu_config(cls, data: dict[str, Any]) -> GPUConfig:
        """Parse GPU configuration from dict.

        Args:
            data: Dictionary with GPU configuration

        Returns:
            GPUConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            return GPUConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid GPU configuration", e.errors()) from e

    @classmethod
    def parse_file(cls, config_path: str | Path) -> dict[str, Any]:
        """Parse configuration from JSON file.

        Args:
            config_path: Path to configuration file

        Returns:
            Dictionary with parsed configuration

        Raises:
            ConfigParseError: If file cannot be read or parsed
        """
        path = Path(config_path)
        if not path.exists():
            raise ConfigParseError(f"Configuration file not found: {config_path}")

        try:
            with path.open("r") as f:
                data = cast(dict[str, Any], json.load(f))
            return data
        except json.JSONDecodeError as e:
            raise ConfigParseError(f"Invalid JSON in configuration file: {e}") from e
        except Exception as e:
            raise ConfigParseError(f"Error reading configuration file: {e}") from e

    @classmethod
    def parse_full_config(
        cls,
        config_path: str | Path,
    ) -> tuple[ModelConfig, TrainingConfig, ParallelismConfig, EngineConfig, GPUConfig]:
        """Parse complete configuration from file.

        Args:
            config_path: Path to configuration file

        Returns:
            Tuple of (ModelConfig, TrainingConfig, ParallelismConfig, EngineConfig, GPUConfig)

        Raises:
            ConfigParseError: If validation fails
        """
        data = cls.parse_file(config_path)

        try:
            model_config = cls.parse_model_config(data.get("model", {}))
            training_config = cls.parse_training_config(data.get("training", {}))
            parallelism_config = cls.parse_parallelism_config(data.get("parallelism", {}))
            engine_config = cls.parse_engine_config(data.get("engine", {}))
            gpu_config = cls.parse_gpu_config(data.get("hardware", {}))

            return (
                model_config,
                training_config,
                parallelism_config,
                engine_config,
                gpu_config,
            )
        except ConfigParseError:
            raise
        except Exception as e:
            raise ConfigParseError(f"Unexpected error parsing configuration: {e}") from e
+
+
def load_config(config_path: str | Path) -> dict[str, Any]:
    """Read a JSON configuration file and return its raw contents.

    Args:
        config_path: Location of the JSON configuration file.

    Returns:
        The parsed configuration as a dictionary.
    """
    parsed = ConfigParser.parse_file(config_path)
    return parsed
+
+
+def save_config(data: dict[str, Any], output_path: str | Path) -> None:
+ """Save configuration to JSON file.
+
+ Args:
+ data: Configuration dictionary to save
+ output_path: Path to save configuration file
+ """
+ path = Path(output_path)
+ path.parent.mkdir(parents=True, exist_ok=True)
+
+ with path.open("w") as f:
+ json.dump(data, f, indent=2)
diff --git a/src/gpu_mem_calculator/config/presets.py b/src/gpu_mem_calculator/config/presets.py
new file mode 100644
index 0000000000000000000000000000000000000000..043183c9225ffa0c8f6d005b7b840a550c563f64
--- /dev/null
+++ b/src/gpu_mem_calculator/config/presets.py
@@ -0,0 +1,83 @@
+"""Preset model configurations loader.
+
+This module provides a centralized location for managing model preset
+configurations that can be used by both CLI and web interfaces.
+"""
+
+import json
+from pathlib import Path
+from typing import Any, cast
+
# Base directory for the package: four parents up from this file
# (config/ -> gpu_mem_calculator/ -> src/ -> project root).
BASE_DIR = Path(__file__).parent.parent.parent.parent
+
+
def get_presets_file_path() -> Path:
    """Locate the presets JSON file.

    Prefers ``web/presets/models.json`` under the project root and falls
    back to the in-package path used by development installs.

    Returns:
        Path to the presets JSON file (the fallback path is returned
        even when it does not exist)
    """
    candidates = (
        BASE_DIR / "web" / "presets" / "models.json",
        BASE_DIR / "src" / "gpu_mem_calculator" / "presets" / "models.json",
    )
    # Return the first existing candidate; otherwise the last one as-is.
    for candidate in candidates[:-1]:
        if candidate.exists():
            return candidate
    return candidates[-1]
+
+
def load_presets() -> dict[str, dict[str, Any]]:
    """Load every preset model configuration from disk.

    Returns:
        Mapping of preset name to preset entry (display_name,
        description, config). An empty dict is returned when the presets
        file is missing, unreadable, or contains invalid JSON.
    """
    path = get_presets_file_path()

    # A missing or unreadable file is treated the same as "no presets".
    try:
        raw = path.read_text()
    except OSError:
        return {}

    try:
        return cast(dict[str, dict[str, Any]], json.loads(raw))
    except json.JSONDecodeError:
        return {}
+
+
def get_preset_config(preset_name: str) -> dict[str, Any] | None:
    """Fetch the calculator config for a single preset.

    Args:
        preset_name: Name of the preset to retrieve

    Returns:
        The preset's ``config`` section, or ``None`` when no preset with
        that name exists
    """
    entry = load_presets().get(preset_name)

    # Unknown preset -> None; known preset -> only its "config" payload
    # (display metadata is not needed by the calculator).
    if entry is None:
        return None
    return cast(dict[str, Any], entry.get("config", {}))
+
+
def list_presets() -> dict[str, dict[str, str]]:
    """Enumerate available presets with display metadata.

    Returns:
        Mapping of preset name to ``{"display_name": ..., "description": ...}``.
        The preset name doubles as the display name when none is set.
    """
    summary: dict[str, dict[str, str]] = {}
    for name, entry in load_presets().items():
        summary[name] = {
            "display_name": entry.get("display_name", name),
            "description": entry.get("description", ""),
        }
    return summary
diff --git a/src/gpu_mem_calculator/core/__init__.py b/src/gpu_mem_calculator/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2ce84261ff34549a4bdd55250a7b39ad9f51226
--- /dev/null
+++ b/src/gpu_mem_calculator/core/__init__.py
@@ -0,0 +1,24 @@
+"""Core memory calculation models and formulas."""
+
+from gpu_mem_calculator.core.formulas import Precision
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ EngineType,
+ GPUConfig,
+ ModelConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
# Public names re-exported at the gpu_mem_calculator.core level
# (configuration models plus the Precision dtype descriptor).
__all__ = [
    "ModelConfig",
    "TrainingConfig",
    "ParallelismConfig",
    "EngineConfig",
    "EngineType",
    "GPUConfig",
    "Precision",
]
+
+# Import GPUMemoryCalculator separately to avoid circular import
+# Use: from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
diff --git a/src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f36a3e0ca26299eac66e2ce946fa1e4001d658b
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f23111875c25e5c6e16ba9043021e526c71899d4
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b34dac9fa4f2a4fc6a2beacc8d68bdf6e262383b
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c91a728c36b563dace22e719313ad40c450cc23f
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8cf9aad04dac7f175f6676b17fc7c3e50490c09
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/calculator.py b/src/gpu_mem_calculator/core/calculator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7d894a8b97ca7d4ae2268b175c6bf42f59302a6
--- /dev/null
+++ b/src/gpu_mem_calculator/core/calculator.py
@@ -0,0 +1,178 @@
+"""Main GPU memory calculator.
+
+Orchestrates the memory calculation by selecting the appropriate
+training engine and aggregating results.
+"""
+
+from gpu_mem_calculator.config.parser import ConfigParser
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ EngineType,
+ GPUConfig,
+ MemoryResult,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+from gpu_mem_calculator.engines import (
+ DeepSpeedEngine,
+ FSDPEngine,
+ MegatronDeepSpeedEngine,
+ MegatronLMEngine,
+ PyTorchDDPEngine,
+)
+
# Type alias for engine types: the closed set of engine implementations
# that GPUMemoryCalculator._get_engine may return.
EngineTypeAlias = (
    PyTorchDDPEngine | DeepSpeedEngine | MegatronLMEngine | FSDPEngine | MegatronDeepSpeedEngine
)
+
+
class GPUMemoryCalculator:
    """Main GPU memory calculator.

    This class provides a high-level interface for calculating GPU
    memory requirements for LLM training. It selects the training
    engine implementation that matches the engine configuration and
    delegates the memory math to it.
    """

    # Engine type -> implementation class. All engines share the same
    # constructor signature, so construction is table-driven instead of
    # duplicating the keyword arguments in a five-arm match statement.
    # Unknown types fall back to PyTorch DDP in _get_engine.
    _ENGINE_CLASSES = {
        EngineType.PYTORCH_DDP: PyTorchDDPEngine,
        EngineType.DEEPSPEED: DeepSpeedEngine,
        EngineType.MEGATRON_LM: MegatronLMEngine,
        EngineType.FSDP: FSDPEngine,
        EngineType.MEGATRON_DEEPSPEED: MegatronDeepSpeedEngine,
    }

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig | None = None,
        engine_config: EngineConfig | None = None,
        gpu_config: GPUConfig | None = None,
        node_config: NodeConfig | None = None,
    ) -> None:
        """Initialize the calculator.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings (default: no parallelism)
            engine_config: Training engine configuration (default: PyTorch DDP)
            gpu_config: Hardware configuration (default: 1x 80GB GPU)
            node_config: Multi-node configuration (default: single node)
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config or ParallelismConfig()
        self.engine_config = engine_config or EngineConfig()
        self.gpu_config = gpu_config or GPUConfig()
        self.node_config = node_config or NodeConfig()

    def calculate(self) -> MemoryResult:
        """Calculate GPU memory requirements.

        Selects the appropriate training engine based on configuration
        and returns the memory calculation result.

        Returns:
            MemoryResult with complete memory breakdown
        """
        engine = self._get_engine()
        return engine.calculate_memory()

    def _get_engine(self) -> EngineTypeAlias:
        """Get the appropriate training engine instance.

        Unknown engine types default to PyTorch DDP.

        Returns:
            Engine instance configured with current settings
        """
        engine_cls = self._ENGINE_CLASSES.get(self.engine_config.type, PyTorchDDPEngine)
        return engine_cls(
            model_config=self.model_config,
            training_config=self.training_config,
            parallelism_config=self.parallelism_config,
            engine_config=self.engine_config,
            gpu_config=self.gpu_config,
            node_config=self.node_config,
        )

    @classmethod
    def from_config_file(
        cls,
        config_path: str,
    ) -> "GPUMemoryCalculator":
        """Create calculator from configuration file.

        Note: multi-node settings are not read from the file; the
        default single-node NodeConfig is used.

        Args:
            config_path: Path to JSON configuration file

        Returns:
            Configured GPUMemoryCalculator instance
        """
        model_config, training_config, parallelism_config, engine_config, gpu_config = (
            ConfigParser.parse_full_config(config_path)
        )

        return cls(
            model_config=model_config,
            training_config=training_config,
            parallelism_config=parallelism_config,
            engine_config=engine_config,
            gpu_config=gpu_config,
        )

    def to_dict(self) -> dict:
        """Export calculator configuration to dictionary.

        Returns:
            Dictionary with all configuration sections (model, training,
            parallelism, engine, hardware, multinode)
        """
        return {
            "model": self.model_config.model_dump(),
            "training": self.training_config.model_dump(),
            "parallelism": self.parallelism_config.model_dump(),
            "engine": self.engine_config.model_dump(),
            "hardware": self.gpu_config.model_dump(),
            "multinode": self.node_config.model_dump(),
        }
diff --git a/src/gpu_mem_calculator/core/formulas.py b/src/gpu_mem_calculator/core/formulas.py
new file mode 100644
index 0000000000000000000000000000000000000000..73f14a2003ba8684934f34151956a925538dd89f
--- /dev/null
+++ b/src/gpu_mem_calculator/core/formulas.py
@@ -0,0 +1,268 @@
+"""Memory calculation formulas.
+
+This module contains the fundamental formulas for calculating GPU memory
+requirements for LLM training.
+"""
+
+from dataclasses import dataclass
+
+
@dataclass
class Precision:
    """Precision information for a data type.

    This is re-exported from utils.precision for convenience.
    """

    # Canonical dtype name, e.g. "fp16" or "int8".
    name: str
    # Storage width of one parameter in bits.
    bits_per_param: int
    # Storage width of one parameter in bytes (float to allow sub-byte
    # types such as int4).
    bytes_per_param: float
    # True for integer (quantized) types, False for floating point.
    is_integer: bool = False
+
+
def calculate_parameter_memory(
    num_params: int,
    dtype: str,
    num_gpus: int = 1,
) -> float:
    """Calculate memory in GB for model parameters.

    Args:
        num_params: Number of model parameters
        dtype: Data type (e.g., "fp32", "fp16", "bf16", "int8", "int4")
        num_gpus: Number of GPUs for distribution (currently unused
            here — sharding across tensor/pipeline/data parallel ranks
            is applied by the engine implementations instead)

    Returns:
        Memory in GB (full, unsharded size)
    """
    from gpu_mem_calculator.utils.precision import gb_from_params

    # The unsharded footprint is returned; engines divide by the
    # relevant parallelism degrees themselves.
    return gb_from_params(num_params, dtype)
+
+
def calculate_gradient_memory(
    num_params: int,
    dtype: str,
) -> float:
    """Calculate memory in GB for gradients.

    Gradients occupy one value per parameter, typically in the same
    precision as the parameters (even though the optimizer applies its
    update in FP32).

    Args:
        num_params: Number of model parameters
        dtype: Data type for gradients

    Returns:
        Memory in GB
    """
    from gpu_mem_calculator.utils.precision import gb_from_params

    # One gradient per parameter -> same footprint as the parameters.
    return gb_from_params(num_params, dtype)
+
+
def calculate_optimizer_memory(
    num_params: int,
    optimizer: str,
) -> float:
    """Calculate memory in GB for optimizer states.

    Optimizer states are assumed to be kept in FP32. Bytes per
    parameter by optimizer:

    - adam / adamw: 12 bytes (FP32 parameter copy + momentum + variance).
      Reference: https://blog.eleuther.ai/transformer-math/#optimizer-states
      Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
    - adamw_8bit: ~2 bytes (bitsandbytes quantized states).
    - sgd: 4 bytes (FP32 momentum buffer; momentum assumed enabled).
    - anything else: treated like Adam (12 bytes).

    Args:
        num_params: Number of model parameters
        optimizer: Optimizer type (adam, adamw, sgd, adamw_8bit)

    Returns:
        Memory in GB (for FP32 optimizer states)
    """
    from gpu_mem_calculator.utils.precision import gb_from_bytes

    bytes_per_param_by_optimizer = {
        "adam": 12.0,
        "adamw": 12.0,
        "adamw_8bit": 2.0,
        "sgd": 4.0,
    }
    # Unknown optimizer names fall back to the Adam footprint.
    bytes_per_param = bytes_per_param_by_optimizer.get(optimizer.lower(), 12.0)

    return gb_from_bytes(num_params * bytes_per_param)
+
+
def calculate_activation_memory(
    batch_size: int,
    seq_len: int,
    hidden_size: int,
    num_layers: int,
    num_attention_heads: int,
    tensor_parallel_size: int = 1,
    activation_checkpointing: int = 0,
    moe_enabled: bool = False,
    num_experts: int = 1,
    top_k: int = 1,
    expert_intermediate_size: int | None = None,
) -> float:
    """Calculate approximate memory in GB for activations.

    This is an estimate: true activation memory depends on the specific
    model implementation and framework.

    Reference: https://blog.eleuther.ai/transformer-math/#activations
    Reference: https://arxiv.org/abs/2204.13323 ("Reducing Activation
    Recomputation in Large Transformer Models")

    The EleutherAI estimate for selective activation checkpointing is
    sbhL(10 + 24/t) bytes, with s = seq_len, b = batch_size,
    h = hidden_size, L = num_layers, and t = tensor_parallel_size. This
    implementation approximates it with a flat 16 bytes per hidden unit
    per token per layer — a middle-ground value that stays simple to
    understand and modify.

    For MoE models the estimate is scaled down because only top_k of the
    num_experts experts are active per token, plus a small router
    (gating) overhead.

    Args:
        batch_size: Batch size per GPU
        seq_len: Sequence length
        hidden_size: Hidden dimension size
        num_layers: Number of transformer layers
        num_attention_heads: Number of attention heads (not used by this
            heuristic)
        tensor_parallel_size: Tensor parallelism degree
        activation_checkpointing: Checkpointing level (0-4)
        moe_enabled: Whether model uses Mixture of Experts
        num_experts: Total number of experts (for MoE)
        top_k: Number of active experts per token (for MoE)
        expert_intermediate_size: Expert intermediate layer size (for MoE)

    Returns:
        Memory in GB
    """
    from gpu_mem_calculator.utils.precision import gb_from_bytes

    # Heuristic: ~16 bytes of activations per hidden unit per token per
    # layer (attention outputs, MLP activations, layer norms, ...).
    per_token_layer_bytes = hidden_size * 16

    # MoE scaling: fraction of active experts plus 0.1 router overhead,
    # capped so the estimate never exceeds the dense equivalent.
    moe_scale = 1.0
    if moe_enabled and num_experts > 1:
        moe_scale = min(1.0, top_k / num_experts + 0.1)

    # Experts are often wider than the dense 2x-hidden baseline; scale
    # up by the width ratio, capped at a 2x increase.
    if moe_enabled and expert_intermediate_size:
        moe_scale *= min(2.0, expert_intermediate_size / (hidden_size * 2))

    total_bytes = (
        batch_size
        * seq_len
        * num_layers
        * per_token_layer_bytes
        * moe_scale
        / tensor_parallel_size
    )

    # Fraction of activations retained at each checkpointing level:
    # 0 = none checkpointed (100%) ... 4 = full checkpointing (20%).
    retained = (1.0, 0.8, 0.6, 0.4, 0.2)[min(activation_checkpointing, 4)]
    total_bytes *= retained

    return gb_from_bytes(total_bytes)
+
+
def calculate_overhead(
    total_memory: float,
    overhead_factor: float = 0.2,
) -> float:
    """Estimate additional memory overhead.

    Covers CUDA context, allocator fragmentation, temporary buffers and
    similar costs that scale roughly with total usage.

    Args:
        total_memory: Total calculated memory in GB
        overhead_factor: Fraction to add for overhead (default 20%)

    Returns:
        Overhead memory in GB
    """
    overhead = total_memory * overhead_factor
    return overhead
+
+
+def estimate_largest_layer_params(
+ hidden_size: int,
+ num_attention_heads: int,
+ intermediate_size: int | None = None,
+) -> int:
+ """Estimate the largest layer parameters for ZeRO-3 calculations.
+
+ The largest layer is typically the MLP layer or attention projection.
+
+ Args:
+ hidden_size: Hidden dimension size
+ num_attention_heads: Number of attention heads
+ intermediate_size: MLP intermediate size (default 4 * hidden_size)
+
+ Returns:
+ Estimated number of parameters in the largest layer
+ """
+ if intermediate_size is None:
+ intermediate_size = 4 * hidden_size
+
+ # MLP layer: hidden_size * intermediate_size * 2 (for up and down projections)
+ mlp_params = hidden_size * intermediate_size * 2
+
+ # Attention output projection: hidden_size * hidden_size
+ attn_params = hidden_size * hidden_size
+
+ return max(mlp_params, attn_params)
diff --git a/src/gpu_mem_calculator/core/models.py b/src/gpu_mem_calculator/core/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..53266999d8990cefbe1465e3e9379e4620e9f03a
--- /dev/null
+++ b/src/gpu_mem_calculator/core/models.py
@@ -0,0 +1,568 @@
+"""Data models for GPU memory calculation."""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Literal, cast
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic_core.core_schema import ValidationInfo as FieldValidationInfo
+
+
class EngineType(str, Enum):
    """Supported training engine types.

    Values are the strings accepted in configuration files.
    """

    PYTORCH_DDP = "pytorch_ddp"  # PyTorch DistributedDataParallel
    DEEPSPEED = "deepspeed"  # DeepSpeed (ZeRO stages, offload)
    MEGATRON_LM = "megatron_lm"  # Megatron-LM
    FSDP = "fsdp"  # PyTorch Fully Sharded Data Parallel
    MEGATRON_DEEPSPEED = "megatron_deepspeed"  # Megatron + DeepSpeed hybrid
+
+
class InferenceEngineType(str, Enum):
    """Supported inference engine types.

    Values are the strings accepted in configuration files.
    """

    HUGGINGFACE = "huggingface"  # Hugging Face Transformers
    VLLM = "vllm"
    TGI = "tgi"  # Text Generation Inference
    TENSORRT_LLM = "tensorrt_llm"
    TRTLLM = "trtllm"  # NOTE(review): presumably an accepted alias for TENSORRT_LLM — confirm
    SGLANG = "sglang"
+
+
class OptimizerType(str, Enum):
    """Supported optimizer types.

    Values match the names understood by calculate_optimizer_memory.
    """

    ADAM = "adam"
    ADAMW = "adamw"
    SGD = "sgd"
    ADAMW_8BIT = "adamw_8bit"  # bitsandbytes-style 8-bit AdamW
+
+
class DType(str, Enum):
    """Supported data types."""

    FP32 = "fp32"  # 32-bit float
    FP16 = "fp16"  # 16-bit float
    BF16 = "bf16"  # bfloat16
    INT8 = "int8"  # 8-bit integer (quantized)
    INT4 = "int4"  # 4-bit integer (quantized)
+
+
class OffloadDevice(str, Enum):
    """CPU offload options (DeepSpeed-style offload targets)."""

    NONE = "none"  # keep on GPU
    CPU = "cpu"  # offload to host memory
    NVME = "nvme"  # offload to NVMe storage
+
+
class ModelConfig(BaseModel):
    """Model architecture configuration.

    Describes a (possibly MoE) transformer. largest_layer_params is
    auto-derived by a model validator when not supplied.
    """

    name: str = Field(default="custom", description="Model name")
    num_parameters: int = Field(gt=0, description="Total number of parameters")
    num_layers: int = Field(gt=0, description="Number of transformer layers")
    hidden_size: int = Field(gt=0, description="Hidden dimension size")
    num_attention_heads: int = Field(gt=0, description="Number of attention heads")
    vocab_size: int = Field(default=32000, gt=0, description="Vocabulary size")
    max_seq_len: int = Field(default=2048, gt=0, description="Maximum sequence length")
    largest_layer_params: int | None = Field(
        default=None,
        gt=0,
        description="Largest layer parameters (auto-calculated if not provided)",
    )

    # MoE (Mixture of Experts) parameters
    moe_enabled: bool = Field(default=False, description="Enable Mixture of Experts")
    num_experts: int = Field(default=8, ge=1, description="Number of experts in MoE")
    top_k: int = Field(default=2, ge=1, description="Number of experts activated per token (top-k)")
    expert_intermediate_size: int | None = Field(
        default=None,
        gt=0,
        description="Expert intermediate layer size (defaults to 4x hidden_size)",
    )
    shared_expert_intermediate_size: int | None = Field(
        default=None,
        gt=0,
        description="Shared expert intermediate size (for models like GLM with shared experts)",
    )

    @model_validator(mode="after")
    def calculate_largest_layer(self) -> ModelConfig:
        """Calculate largest layer params if not provided.

        Runs after field validation, so an explicitly supplied value is
        left untouched.
        """
        if self.largest_layer_params is not None:
            return self
        # Calculate it
        # NOTE: hidden_size is validated gt=0, so the truthiness checks
        # on `hidden` below are always true in practice.
        hidden = self.hidden_size
        moe_enabled = self.moe_enabled

        if hidden and moe_enabled:
            # For MoE: largest layer includes expert parameters
            # (up + down projection of one expert: h * i * 2).
            expert_intermediate = self.expert_intermediate_size or hidden * 4
            self.largest_layer_params = int(hidden * expert_intermediate * 2)
        elif hidden:
            # Dense model: attention output + MLP
            # NOTE(review): this yields 4*h^2, but
            # formulas.estimate_largest_layer_params computes
            # h * (4h) * 2 = 8*h^2 for the same default dense MLP —
            # confirm which estimate is intended.
            self.largest_layer_params = int(hidden * hidden * 4)
        return self

    @property
    def effective_num_experts(self) -> int:
        """Get effective number of experts (returns 1 if MoE disabled)."""
        return self.num_experts if self.moe_enabled else 1

    @property
    def active_experts(self) -> int:
        """Get number of active experts per token (top_k or 1 if dense)."""
        return self.top_k if self.moe_enabled else 1
+
+
class TrainingConfig(BaseModel):
    """Training hyperparameters configuration."""

    batch_size: int = Field(default=1, gt=0, description="Batch size per GPU")
    gradient_accumulation_steps: int = Field(
        default=1,
        gt=0,
        description="Gradient accumulation steps",
    )
    optimizer: OptimizerType = Field(default=OptimizerType.ADAMW, description="Optimizer type")
    dtype: DType = Field(default=DType.BF16, description="Data type for training")
    activation_checkpointing: int = Field(
        default=0,
        ge=0,
        le=4,
        description="Activation checkpointing level (0-4)",
    )

    @property
    def effective_batch_size(self) -> int:
        """Calculate effective batch size with gradient accumulation.

        Per GPU (batch_size is per GPU); does not account for the
        data-parallel degree.
        """
        return self.batch_size * self.gradient_accumulation_steps
+
+
class ParallelismConfig(BaseModel):
    """Parallelism configuration.

    Defaults (all 1, sequence parallel off) describe single-GPU training.
    """

    tensor_parallel_size: int = Field(default=1, ge=1, description="Tensor parallelism degree")
    pipeline_parallel_size: int = Field(default=1, ge=1, description="Pipeline parallelism degree")
    data_parallel_size: int = Field(default=1, ge=1, description="Data parallelism degree")
    sequence_parallel: bool = Field(default=False, description="Enable sequence parallelism")

    @property
    def total_parallel_size(self) -> int:
        """Calculate total parallelism degree.

        Product of the three degrees, i.e. the total number of ranks
        this layout occupies.
        """
        return self.tensor_parallel_size * self.pipeline_parallel_size * self.data_parallel_size
+
+
class EngineConfig(BaseModel):
    """Training engine specific configuration.

    Fields that apply to only one engine (zero_stage / zero_init for
    DeepSpeed, sharding_strategy for FSDP) are carried here regardless
    of the selected type.
    """

    type: EngineType = Field(default=EngineType.PYTORCH_DDP, description="Training engine type")
    # DeepSpeed only: ZeRO optimization stage (0-3).
    zero_stage: int | None = Field(
        default=None,
        ge=0,
        le=3,
        description="DeepSpeed ZeRO stage (only for DeepSpeed engine)",
    )
    offload_optimizer: OffloadDevice = Field(
        default=OffloadDevice.NONE,
        description="CPU offload for optimizer states",
    )
    offload_param: OffloadDevice = Field(
        default=OffloadDevice.NONE,
        description="CPU offload for parameters",
    )
    zero_init: bool = Field(
        default=True,
        description="Use ZeRO initialization (only for DeepSpeed ZeRO-3)",
    )
    # FSDP only: how model state is sharded across ranks.
    sharding_strategy: Literal["no_shard", "shard_grad_op", "full_shard"] = Field(
        default="full_shard",
        description="FSDP sharding strategy",
    )
+
+
class GPUConfig(BaseModel):
    """Hardware configuration."""

    num_gpus: int = Field(default=1, ge=1, description="Number of GPUs")
    gpu_memory_gb: float = Field(default=80.0, gt=0, description="GPU memory in GB")
    total_gpu_memory_gb: float | None = Field(
        default=None,
        description="Total GPU memory (calculated if not provided)",
    )

    @model_validator(mode="after")
    def calculate_total_memory(self) -> GPUConfig:
        """Fill in total GPU memory when not explicitly provided.

        A ``model_validator`` is used instead of a ``field_validator``
        because pydantic v2 does not run field validators on default
        values — with the previous field validator, ``GPUConfig()``
        silently kept ``total_gpu_memory_gb`` as ``None``. This also
        matches the pattern already used by ModelConfig.
        """
        if self.total_gpu_memory_gb is None:
            self.total_gpu_memory_gb = self.num_gpus * self.gpu_memory_gb
        return self
+
+
class InterconnectType(str, Enum):
    """Multi-node interconnect types.

    Used by NodeConfig to pick a default bandwidth when none is given
    explicitly (see NodeConfig.get_interconnect_bandwidth_gbps).
    """

    INFINIBAND = "infiniband"  # default bandwidth assumed: 200 Gbps (HDR200)
    NVLINK = "nvlink"  # default bandwidth assumed: 300 Gbps
    ETHERNET_10G = "ethernet_10g"
    ETHERNET_25G = "ethernet_25g"
    ETHERNET_100G = "ethernet_100g"
    ETHERNET_200G = "ethernet_200g"
+
+
class NodeConfig(BaseModel):
    """Multi-node configuration."""

    num_nodes: int = Field(default=1, ge=1, description="Number of nodes")
    gpus_per_node: int | None = Field(
        default=None,
        ge=1,
        description="GPUs per node (calculated from num_gpus if not provided)",
    )
    interconnect_type: InterconnectType = Field(
        default=InterconnectType.INFINIBAND,
        description="Interconnect type between nodes",
    )
    interconnect_bandwidth_gbps: float | None = Field(
        default=None,
        gt=0,
        description="Interconnect bandwidth in Gbps (default: auto from type)",
    )

    @field_validator("gpus_per_node")
    @classmethod
    def calculate_gpus_per_node(cls, v: int | None, info: FieldValidationInfo) -> int | None:
        """Calculate GPUs per node if not provided.

        NOTE(review): pydantic v2 does not run field validators on
        default values, so when gpus_per_node is omitted this validator
        never fires and the field stays None — confirm intended.
        NOTE(review): "num_gpus" is not a field of NodeConfig (it lives
        on GPUConfig), so info.data.get("num_gpus", 1) always yields 1
        here and the computed value is effectively always 1 — verify.
        """
        if v is None:
            num_nodes = cast(int, info.data.get("num_nodes", 1))
            num_gpus = cast(int, info.data.get("num_gpus", 1))
            return max(1, num_gpus // num_nodes)
        return v

    def get_interconnect_bandwidth_gbps(self) -> float:
        """Get interconnect bandwidth in Gbps.

        Returns bandwidth from config or default based on interconnect type.
        """
        if self.interconnect_bandwidth_gbps:
            return self.interconnect_bandwidth_gbps

        # Default bandwidth values for each interconnect type
        bandwidth_defaults = {
            InterconnectType.INFINIBAND: 200.0,  # HDR200 InfiniBand
            InterconnectType.NVLINK: 300.0,  # NVLink/NVSwitch
            InterconnectType.ETHERNET_10G: 10.0,
            InterconnectType.ETHERNET_25G: 25.0,
            InterconnectType.ETHERNET_100G: 100.0,
            InterconnectType.ETHERNET_200G: 200.0,
        }
        # 100 Gbps fallback for any future/unknown interconnect type.
        return bandwidth_defaults.get(self.interconnect_type, 100.0)

    @property
    def is_multi_node(self) -> bool:
        """Check if this is a multi-node configuration."""
        return self.num_nodes > 1
+
+
class NetworkOverhead(BaseModel):
    """Network communication overhead for multi-node training.

    The *_gb fields are communication volumes in GB broken down by
    collective type (presumably per training step, matching the
    per-step time estimate below — confirm with the producer).
    """

    allreduce_gb: float = Field(default=0.0, ge=0, description="AllReduce communication in GB")
    allgather_gb: float = Field(default=0.0, ge=0, description="AllGather communication in GB")
    reducescatter_gb: float = Field(
        default=0.0, ge=0, description="ReduceScatter communication in GB"
    )
    point_to_point_gb: float = Field(
        default=0.0, ge=0, description="Point-to-point communication in GB"
    )
    total_overhead_gb: float = Field(default=0.0, ge=0, description="Total network overhead in GB")
    estimated_overhead_ms_per_step: float | None = Field(
        default=None,
        description="Estimated communication overhead per training step in milliseconds",
    )
+
+
class HybridParallelismConfig(BaseModel):
    """Hybrid parallelism configuration for optimal multi-node scaling.

    Knobs for automatic parallelism-strategy selection; all behavior is
    implemented by the consumers of this config.
    """

    auto_optimize: bool = Field(
        default=False,
        description="Automatically optimize parallelism strategy for given hardware",
    )
    target_gpu_utilization: float = Field(
        default=0.85,
        gt=0.0,
        le=1.0,
        description="Target GPU memory utilization (0.0-1.0)",
    )
    prefer_pipeline_parallel: bool = Field(
        default=False,
        description="Prefer pipeline parallelism over data parallel for multi-node",
    )
    max_pipeline_chunks: int | None = Field(
        default=None,
        ge=1,
        description="Maximum number of pipeline chunks (virtual stages)",
    )
    enable_sequence_parallel: bool = Field(
        default=True,
        description="Enable sequence parallelism for long sequences",
    )
    sequence_parallel_threshold: int = Field(
        default=4096,
        ge=1,
        description="Sequence length threshold for enabling sequence parallel",
    )
+
+
class MemoryBreakdown(BaseModel):
    """Memory calculation result breakdown."""

    # Disable pydantic's protected "model_" namespace so the
    # model_params_gb field name does not trigger a warning.
    model_config = ConfigDict(protected_namespaces=())

    model_params_gb: float = Field(ge=0, description="Model parameters memory in GB")
    gradients_gb: float = Field(ge=0, description="Gradients memory in GB")
    optimizer_states_gb: float = Field(ge=0, description="Optimizer states memory in GB")
    activations_gb: float = Field(ge=0, description="Activations memory in GB")
    overhead_gb: float = Field(default=0.0, ge=0, description="Additional overhead in GB")

    @property
    def total_memory_gb(self) -> float:
        """Total memory in GB (sum of all components)."""
        return (
            self.model_params_gb
            + self.gradients_gb
            + self.optimizer_states_gb
            + self.activations_gb
            + self.overhead_gb
        )
+
+
class MemoryResult(BaseModel):
    """Complete memory calculation result.

    Produced by the training engines; combines per-GPU and aggregate
    totals with a per-component breakdown and fit diagnostics.
    """

    total_memory_per_gpu_gb: float = Field(ge=0, description="Total memory per GPU in GB")
    total_memory_all_gpus_gb: float = Field(ge=0, description="Total memory across all GPUs in GB")
    cpu_memory_gb: float = Field(default=0.0, ge=0, description="CPU memory required in GB")
    breakdown: MemoryBreakdown = Field(description="Memory breakdown by component")
    network_overhead: NetworkOverhead | None = Field(
        default=None,
        description="Network communication overhead for multi-node training",
    )
    fits_on_gpu: bool = Field(description="Whether the config fits on available GPU")
    memory_utilization_percent: float = Field(ge=0, description="Memory utilization percentage")
    recommended_batch_size: int | None = Field(
        default=None,
        description="Recommended batch size if current doesn't fit",
    )
    multi_node_info: dict | None = Field(
        default=None,
        description="Additional multi-node configuration info",
    )
+
+
class KVCacheQuantization(str, Enum):
    """KV cache quantization options."""

    NONE = "none"  # keep KV cache in the model's compute dtype
    INT8 = "int8"
    FP8 = "fp8"
    INT4 = "int4"
+
+
class InferenceMemoryBreakdown(BaseModel):
    """Memory breakdown for inference workloads.

    Like MemoryBreakdown but without training-only components
    (gradients, optimizer states); adds the KV cache instead.
    """

    # Disable pydantic's protected "model_" namespace so the
    # model_params_gb field name does not trigger a warning.
    model_config = ConfigDict(protected_namespaces=())

    model_params_gb: float = Field(ge=0, description="Model parameters memory in GB")
    kv_cache_gb: float = Field(ge=0, description="KV cache memory in GB")
    activations_gb: float = Field(ge=0, description="Activation memory in GB")
    overhead_gb: float = Field(default=0.0, ge=0, description="Additional overhead in GB")

    @property
    def total_memory_gb(self) -> float:
        """Total memory in GB (sum of all components)."""
        return self.model_params_gb + self.kv_cache_gb + self.activations_gb + self.overhead_gb
+
+
class InferenceConfig(BaseModel):
    """Inference-specific configuration.

    Holds generic serving options plus engine-specific knobs, grouped by
    backend (TGI, vLLM, TensorRT-LLM, SGLang).  Engine-specific fields are
    ``None``/disabled by default so a config is valid for any backend.
    """

    batch_size: int = Field(default=1, gt=0, description="Batch size for inference")
    max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="Override max sequence length for inference (default: use model config)",
    )
    kv_cache_quantization: KVCacheQuantization = Field(
        default=KVCacheQuantization.NONE,
        description="KV cache quantization type",
    )
    use_kv_cache: bool = Field(default=True, description="Enable KV cache for generation")
    tensor_parallel_size: int = Field(default=1, ge=1, description="Tensor parallelism degree")
    enable_streaming: bool = Field(default=False, description="Enable streaming inference")

    # Common inference options
    gpu_memory_utilization: float = Field(
        default=0.9,
        gt=0.0,
        le=1.0,
        description="GPU memory utilization target (0.0-1.0)",
    )

    # TGI-specific options
    max_total_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum total tokens (input + output) - defines memory budget",
    )
    max_input_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum input tokens",
    )
    max_batch_total_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum total tokens across all batches",
    )
    tgi_quantize: Literal[
        "none",
        "awq",
        "eetq",
        "exl2",
        "gptq",
        "marlin",
        "bitsandbytes",
        "bitsandbytes-nf4",
        "bitsandbytes-fp4",
        "fp8",
    ] = Field(
        default="none",
        description="TGI: Weight quantization method",
    )
    tgi_dtype: Literal["float16", "bfloat16"] = Field(
        default="bfloat16",
        description="TGI: Data type for inference",
    )
    sharded: bool = Field(default=False, description="TGI: Enable sharded inference")
    num_shard: int | None = Field(
        default=None,
        ge=1,
        description="TGI: Number of shards for sharded inference",
    )

    # vLLM-specific options
    block_size: int | None = Field(
        default=None,
        ge=1,
        description="vLLM: Block size for KV cache management (default: 16)",
    )
    swap_space_gb: float = Field(default=0.0, ge=0.0, description="vLLM: CPU swap space in GB")
    enable_prefix_caching: bool = Field(default=False, description="vLLM: Enable prefix caching")
    enforce_eager: bool = Field(
        default=False,
        description="vLLM: Enable eager mode (disable CUDA graph)",
    )
    max_num_batched_tokens: int | None = Field(
        default=None,
        gt=0,
        description="vLLM: Maximum number of batched tokens",
    )
    max_num_seqs: int | None = Field(
        default=None,
        gt=0,
        description="vLLM: Maximum number of sequences in a batch",
    )
    vllm_quantization: Literal["none", "awq", "gptq", "squeezellm", "fp8"] = Field(
        default="none",
        description="vLLM: Weight quantization method",
    )

    # TensorRT-LLM-specific options
    trt_max_batch_size: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum batch size",
    )
    trt_max_input_len: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum input length",
    )
    trt_max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum sequence length",
    )
    trt_max_beam_width: int | None = Field(
        default=None,
        ge=1,
        description="TensorRT-LLM: Maximum beam width for beam search",
    )

    # SGLang-specific options
    chunk_size: int | None = Field(
        default=None,
        ge=1,
        description="SGLang: Prefill chunk size for long contexts (default: 8192)",
    )
    max_running_requests: int | None = Field(
        default=None,
        ge=1,
        description="SGLang: Maximum number of concurrent requests",
    )
    disable_radix_cache: bool = Field(
        default=False,
        description="SGLang: Disable RadixAttention cache (for debugging)",
    )
    enable_p2p: bool = Field(
        default=False,
        description="SGLang: Enable P2P attention for multi-GPU",
    )
    disable_custom_all_reduce: bool = Field(
        default=False,
        description="SGLang: Disable custom all-reduce kernel",
    )
    attention_backend: Literal["flashinfer", "triton", "torch"] = Field(
        default="flashinfer",
        description="SGLang: Attention backend implementation",
    )
    enable_torch_compile: bool = Field(
        default=False,
        description="SGLang: Enable torch.compile for model optimization",
    )
    radix_cache_max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="SGLang: Maximum sequence length for RadixCache",
    )
    speculative_algo: Literal["default", "medusa", "eagle"] = Field(
        default="default",
        description="SGLang: Speculative decoding algorithm",
    )
    multi_lora_enabled: bool = Field(default=False, description="SGLang: Enable multi-LoRA serving")
+
+
class InferenceMemoryResult(BaseModel):
    """Inference memory calculation result.

    Produced by the inference calculator; optional fields are only
    populated when the corresponding estimate is available.
    """

    total_memory_per_gpu_gb: float = Field(ge=0, description="Total memory per GPU in GB")
    total_memory_all_gpus_gb: float = Field(ge=0, description="Total memory across all GPUs in GB")
    breakdown: InferenceMemoryBreakdown = Field(description="Memory breakdown by component")
    fits_on_gpu: bool = Field(description="Whether the config fits on available GPU")
    # May exceed 100 when the configuration does not fit.
    memory_utilization_percent: float = Field(ge=0, description="Memory utilization percentage")
    max_supported_batch_size: int | None = Field(
        default=None,
        description="Maximum batch size that fits in GPU memory",
    )
    estimated_throughput_tokens_per_sec: float | None = Field(
        default=None,
        description="Estimated throughput in tokens/second",
    )
diff --git a/src/gpu_mem_calculator/core/multinode.py b/src/gpu_mem_calculator/core/multinode.py
new file mode 100644
index 0000000000000000000000000000000000000000..35b6b02fd333438fb5eeb65154c06ac2da023b89
--- /dev/null
+++ b/src/gpu_mem_calculator/core/multinode.py
@@ -0,0 +1,308 @@
+"""Multi-node training calculator.
+
+Handles network communication overhead calculation and hybrid
+parallelism optimization for multi-node training configurations.
+"""
+
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ EngineType,
+ HybridParallelismConfig,
+ ModelConfig,
+ NetworkOverhead,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
class MultiNodeCalculator:
    """Calculator for multi-node training overhead and optimization.

    This class provides:
    - Network communication overhead estimation
    - Hybrid parallelism strategy optimization
    - Multi-node performance modeling

    NOTE(review): the per-collective volumes below are order-of-magnitude
    heuristics (simple divisions by node count), not exact ring/tree
    collective cost models.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig,
        node_config: NodeConfig,
        engine_config: EngineConfig,
    ) -> None:
        """Initialize the multi-node calculator.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings
            node_config: Multi-node hardware configuration
            engine_config: Training engine configuration
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config
        self.node_config = node_config
        self.engine_config = engine_config

    def calculate_network_overhead(self) -> NetworkOverhead:
        """Calculate network communication overhead for multi-node training.

        Estimates communication overhead for different collective operations
        based on model size, parallelism strategy, and interconnect bandwidth.

        Returns:
            NetworkOverhead with detailed breakdown
        """
        if not self.node_config.is_multi_node:
            # Single node: defaults (presumably all-zero overhead).
            return NetworkOverhead()

        # Get model size in bytes
        model_params = self.model_config.num_parameters
        dtype_bytes = self._get_dtype_bytes()
        model_size_bytes = int(model_params * dtype_bytes)

        # Calculate communication for each collective operation
        allreduce_gb = self._calculate_allreduce_overhead(model_size_bytes)
        allgather_gb = self._calculate_allgather_overhead(model_size_bytes)
        reducescatter_gb = self._calculate_reducescatter_overhead(model_size_bytes)
        point_to_point_gb = self._calculate_pipeline_overhead(model_size_bytes)

        total_overhead_gb = allreduce_gb + allgather_gb + reducescatter_gb + point_to_point_gb

        # Estimate time overhead per step
        overhead_ms = self._estimate_communication_time_ms(total_overhead_gb)

        return NetworkOverhead(
            allreduce_gb=allreduce_gb,
            allgather_gb=allgather_gb,
            reducescatter_gb=reducescatter_gb,
            point_to_point_gb=point_to_point_gb,
            total_overhead_gb=total_overhead_gb,
            estimated_overhead_ms_per_step=overhead_ms,
        )

    def optimize_hybrid_parallelism(
        self,
        hybrid_config: HybridParallelismConfig,
    ) -> ParallelismConfig:
        """Optimize hybrid parallelism strategy for multi-node training.

        Analyzes the hardware configuration and model characteristics
        to recommend optimal parallelism degrees.

        Args:
            hybrid_config: Hybrid parallelism configuration and preferences

        Returns:
            Optimized ParallelismConfig
        """
        if not hybrid_config.auto_optimize:
            # Caller opted out of optimization: return config unchanged.
            return self.parallelism_config

        num_nodes = self.node_config.num_nodes
        gpus_per_node = self.node_config.gpus_per_node or 1
        total_gpus = num_nodes * gpus_per_node

        seq_len = self.model_config.max_seq_len

        # Determine optimal parallelism strategy
        if seq_len >= hybrid_config.sequence_parallel_threshold:
            # Enable sequence parallel for long sequences
            enable_sp = True
        else:
            # Below the threshold, defer to the caller's preference.
            enable_sp = hybrid_config.enable_sequence_parallel

        # Calculate parallelism degrees
        if hybrid_config.prefer_pipeline_parallel and num_nodes > 1:
            # Prefer pipeline parallel across nodes
            pp_size = int(min(num_nodes, 8))  # Limit pipeline stages
            tp_size = int(min(gpus_per_node, 8))  # Tensor parallel within node
            # NOTE: integer division may leave tp*pp*dp < total_gpus when
            # total_gpus is not divisible by pp_size * tp_size.
            dp_size = int(total_gpus // (pp_size * tp_size))
        else:
            # Default: maximize data parallel
            tp_size = 1
            pp_size = 1
            dp_size = int(total_gpus)

        # Ensure all values are at least 1
        tp_size = max(1, tp_size)
        pp_size = max(1, pp_size)
        dp_size = max(1, dp_size)

        return ParallelismConfig(
            tensor_parallel_size=tp_size,
            pipeline_parallel_size=pp_size,
            data_parallel_size=dp_size,
            sequence_parallel=enable_sp,
        )

    def _calculate_allreduce_overhead(self, model_size_bytes: int) -> float:
        """Calculate AllReduce communication overhead.

        AllReduce is used for gradient averaging in data parallel training.
        Algorithm: Ring AllReduce with O(2 * model_size) communication.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        # Ring AllReduce: each GPU sends/receives 2 * model_size / num_gpus
        # But we need the total across the network

        # For gradient averaging: 2 * model_size (send + receive)
        allreduce_bytes = 2 * model_size_bytes

        # Adjust for collective operation efficiency
        # In multi-node, cross-node traffic is the bottleneck
        if self.node_config.is_multi_node:
            # Only cross-node traffic matters
            # NOTE(review): dividing by num_nodes is a rough approximation of
            # the cross-node share; a ring AllReduce's exact per-link volume
            # differs — confirm against the intended cost model.
            allreduce_bytes = int(allreduce_bytes / self.node_config.num_nodes)

        # Convert bytes -> GiB.
        return allreduce_bytes / (1024**3)

    def _calculate_allgather_overhead(self, model_size_bytes: int) -> float:
        """Calculate AllGather communication overhead.

        AllGather is used in ZeRO-3 and tensor parallel for parameter gathering.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        # AllGather: (num_gpus - 1) * model_size / num_gpus per GPU
        # But for ZeRO-3, we gather all parameters
        is_zero3 = (
            self.engine_config.type == EngineType.DEEPSPEED and self.engine_config.zero_stage == 3
        )

        if is_zero3:
            # ZeRO-3 gathers all parameters during forward pass
            allgather_bytes = model_size_bytes
        else:
            # Standard allgather for tensor parallel
            allgather_bytes = int(model_size_bytes / self.parallelism_config.tensor_parallel_size)

        # Adjust for multi-node
        if self.node_config.is_multi_node:
            allgather_bytes = int(allgather_bytes / self.node_config.num_nodes)

        return allgather_bytes / (1024**3)

    def _calculate_reducescatter_overhead(self, model_size_bytes: int) -> float:
        """Calculate ReduceScatter communication overhead.

        ReduceScatter is used in ZeRO-2 and gradient sharding.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        is_zero2 = (
            self.engine_config.type == EngineType.DEEPSPEED and self.engine_config.zero_stage == 2
        )

        if is_zero2:
            # ZeRO-2 scatters gradients
            reducescatter_bytes = model_size_bytes
        else:
            # Standard reducescatter
            reducescatter_bytes = int(model_size_bytes / self.parallelism_config.data_parallel_size)

        # Adjust for multi-node
        if self.node_config.is_multi_node:
            reducescatter_bytes = int(reducescatter_bytes / self.node_config.num_nodes)

        return reducescatter_bytes / (1024**3)

    def _calculate_pipeline_overhead(self, model_size_bytes: int) -> float:
        """Calculate pipeline parallel communication overhead.

        Point-to-point communication between pipeline stages.

        Args:
            model_size_bytes: Model size in bytes (currently unused; the
                estimate is based on activation size instead)

        Returns:
            Communication volume in GB
        """
        if self.parallelism_config.pipeline_parallel_size <= 1:
            # No pipeline stages -> no point-to-point traffic.
            return 0.0

        # Pipeline parallel sends activations between stages
        # Approximate as layer activations
        hidden_size = self.model_config.hidden_size
        seq_len = self.model_config.max_seq_len
        batch_size = self.training_config.batch_size
        num_layers = self.model_config.num_layers

        # Activation size per layer (2 bytes/element, FP16/BF16)
        activation_bytes = batch_size * seq_len * hidden_size * 2  # FP16/BF16

        # Number of microbatches determines communication frequency
        # For simplicity, assume num_stages communications per step
        # NOTE(review): multiplier is layers-per-stage (num_layers // pp_size),
        # not the number of stages the comment above suggests — confirm intent.
        pp_size = self.parallelism_config.pipeline_parallel_size
        pipeline_comm_bytes = activation_bytes * (num_layers // pp_size)

        # Adjust for multi-node
        if self.node_config.is_multi_node:
            pipeline_comm_bytes = int(pipeline_comm_bytes / self.node_config.num_nodes)

        return pipeline_comm_bytes / (1024**3)

    def _estimate_communication_time_ms(self, total_gb: float) -> float:
        """Estimate communication time per training step in milliseconds.

        Args:
            total_gb: Total communication volume in GB

        Returns:
            Estimated time in milliseconds
        """
        if total_gb == 0:
            return 0.0

        # Get bandwidth in GB/s
        # Assumes get_interconnect_bandwidth_gbps() returns gigabits/s,
        # hence the divide-by-8 to gigabytes/s — confirm against NodeConfig.
        bandwidth_gbps = self.node_config.get_interconnect_bandwidth_gbps()
        bandwidth_gbps_per_sec = bandwidth_gbps / 8  # Convert to GB/s

        # Basic time = size / bandwidth
        time_seconds = total_gb / bandwidth_gbps_per_sec

        # Add latency overhead for collective operations
        # Typical latency: 10-50 microseconds per hop
        num_nodes = self.node_config.num_nodes
        latency_overhead = num_nodes * 0.00005  # 50 microseconds per node

        # Network efficiency factor (not 100% efficient)
        efficiency = 0.85

        total_time_seconds = (time_seconds / efficiency) + latency_overhead

        return total_time_seconds * 1000  # Convert to ms

    def _get_dtype_bytes(self) -> float:
        """Get bytes per element based on dtype.

        Unknown dtypes fall back to 2 bytes (fp16/bf16).
        """
        dtype_map = {
            "fp32": 4,
            "fp16": 2,
            "bf16": 2,
            "int8": 1,
            "int4": 0.5,
        }
        return dtype_map.get(self.training_config.dtype.value, 2)

    def _calculate_model_size_gb(self) -> float:
        """Calculate model size in GB.

        NOTE(review): not called within this class — presumably used by
        external callers; verify before removing.
        """
        dtype_bytes = self._get_dtype_bytes()
        model_size_bytes = self.model_config.num_parameters * dtype_bytes
        return model_size_bytes / (1024**3)
diff --git a/src/gpu_mem_calculator/engines/__init__.py b/src/gpu_mem_calculator/engines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab2fe2071526ef3b72db402629ec275ce4b38cbb
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/__init__.py
@@ -0,0 +1,16 @@
"""Training engine implementations."""

from gpu_mem_calculator.engines.base import BaseEngine
from gpu_mem_calculator.engines.deepspeed import DeepSpeedEngine
from gpu_mem_calculator.engines.fsdp import FSDPEngine
from gpu_mem_calculator.engines.megatron import MegatronDeepSpeedEngine, MegatronLMEngine
from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine

# Public API of the engines subpackage: one engine class per supported
# training framework, plus the shared abstract base.
__all__ = [
    "BaseEngine",
    "PyTorchDDPEngine",
    "DeepSpeedEngine",
    "MegatronLMEngine",
    "MegatronDeepSpeedEngine",
    "FSDPEngine",
]
diff --git a/src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d77e30e547c6b15aa80dbb5e0dc77611ec4832b7
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb86ec0de44861e075692e6415234cd7466bc08c
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d172ff5cf1b1f65432e54aa0a46b1975561890bf
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da56619169431d8b808eb6a2dcd24573239ba03c
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87716550d68026c49d48b1ffa1ca58e5c90f58ef
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8444805000d819888fb32df24f13d2061a3b0f0d
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/base.py b/src/gpu_mem_calculator/engines/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..b12e4e7641e25e8adfe649c6172710b8363bff9d
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/base.py
@@ -0,0 +1,220 @@
+"""Base class for training engine implementations."""
+
+from abc import ABC, abstractmethod
+
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ GPUConfig,
+ MemoryBreakdown,
+ MemoryResult,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
class BaseEngine(ABC):
    """Abstract base class for training engine memory calculation.

    Each training engine (PyTorch DDP, DeepSpeed, Megatron-LM, etc.)
    should implement this interface to provide engine-specific
    memory calculations.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig,
        engine_config: EngineConfig,
        gpu_config: GPUConfig,
        node_config: NodeConfig | None = None,
    ) -> None:
        """Initialize the engine with configuration.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings
            engine_config: Engine-specific configuration
            gpu_config: Hardware configuration
            node_config: Multi-node configuration (optional; defaults to a
                single-node NodeConfig)
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config
        self.engine_config = engine_config
        self.gpu_config = gpu_config
        self.node_config = node_config or NodeConfig()

    @abstractmethod
    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for this engine.

        This is the main method that should be implemented by each engine.

        Returns:
            MemoryResult with complete memory breakdown
        """

    def _check_feasibility(
        self,
        total_memory_per_gpu: float,
    ) -> tuple[bool, float, int | None]:
        """Check if the configuration fits on available GPU.

        Args:
            total_memory_per_gpu: Total memory required per GPU (GB)

        Returns:
            Tuple of (fits_on_gpu, utilization_percent, recommended_batch_size).
            ``utilization_percent`` may exceed 100 when the config does not
            fit; ``recommended_batch_size`` is None when it does fit.
        """
        available_memory = self.gpu_config.gpu_memory_gb
        utilization_percent = (total_memory_per_gpu / available_memory) * 100

        fits_on_gpu = total_memory_per_gpu <= available_memory

        # If doesn't fit, suggest a smaller batch size
        recommended_batch_size = None
        if not fits_on_gpu:
            # Simple heuristic: scale batch size inversely with memory excess
            # (assumes memory scales roughly linearly with batch size).
            excess_factor = total_memory_per_gpu / available_memory
            recommended_batch_size = max(1, int(self.training_config.batch_size / excess_factor))

        return fits_on_gpu, utilization_percent, recommended_batch_size

    def _create_result(
        self,
        breakdown: MemoryBreakdown,
        cpu_memory_gb: float = 0.0,
    ) -> MemoryResult:
        """Create a MemoryResult from breakdown.

        Args:
            breakdown: Memory breakdown by component
            cpu_memory_gb: CPU memory required (default 0)

        Returns:
            Complete MemoryResult, including network-overhead and multi-node
            info when ``node_config`` describes more than one node.
        """
        total_memory_per_gpu = breakdown.total_memory_gb
        total_memory_all_gpus = total_memory_per_gpu * self.gpu_config.num_gpus

        fits_on_gpu, utilization_percent, recommended_batch_size = self._check_feasibility(
            total_memory_per_gpu
        )

        # Calculate network overhead for multi-node configurations
        network_overhead = None
        multi_node_info = None
        if self.node_config.is_multi_node:
            # Local import to avoid a circular dependency between the
            # engines package and core.multinode.
            from gpu_mem_calculator.core.multinode import MultiNodeCalculator

            multinode_calc = MultiNodeCalculator(
                model_config=self.model_config,
                training_config=self.training_config,
                parallelism_config=self.parallelism_config,
                node_config=self.node_config,
                engine_config=self.engine_config,
            )
            network_overhead = multinode_calc.calculate_network_overhead()

            # Add multi-node info
            multi_node_info = {
                "num_nodes": self.node_config.num_nodes,
                "gpus_per_node": self.node_config.gpus_per_node,
                "interconnect_type": self.node_config.interconnect_type.value,
                "interconnect_bandwidth_gbps": self.node_config.get_interconnect_bandwidth_gbps(),
            }

        return MemoryResult(
            total_memory_per_gpu_gb=total_memory_per_gpu,
            total_memory_all_gpus_gb=total_memory_all_gpus,
            cpu_memory_gb=cpu_memory_gb,
            breakdown=breakdown,
            network_overhead=network_overhead,
            fits_on_gpu=fits_on_gpu,
            memory_utilization_percent=utilization_percent,
            recommended_batch_size=recommended_batch_size,
            multi_node_info=multi_node_info,
        )

    @property
    def effective_batch_size(self) -> int:
        """Calculate effective batch size with gradient accumulation."""
        return (
            self.training_config.batch_size
            * self.training_config.gradient_accumulation_steps
            * self.parallelism_config.data_parallel_size
        )

    @property
    def total_num_gpus(self) -> int:
        """Get total number of GPUs."""
        return self.gpu_config.num_gpus

    @property
    def num_gpus_per_model(self) -> int:
        """Get number of GPUs per model replica.

        This is tensor_parallel * pipeline_parallel for distributed training.
        """
        return (
            self.parallelism_config.tensor_parallel_size
            * self.parallelism_config.pipeline_parallel_size
        )

    def calculate_moe_activation_multiplier(self) -> float:
        """Calculate activation memory multiplier for MoE models.

        For MoE models, activation memory depends on top_k (active experts per
        token) rather than total number of experts, because only top_k experts
        are activated per token during forward/backward pass.

        Returns:
            Multiplier for activation memory (1.0 for dense models, <1 for MoE)
        """
        if not self.model_config.moe_enabled:
            return 1.0

        num_experts = self.model_config.num_experts
        top_k = self.model_config.top_k

        # Defensive guard: an MoE config with missing or zero expert counts
        # would otherwise raise (None arithmetic / division by zero).
        # Treat such a config as dense rather than crashing.
        if not num_experts or not top_k:
            return 1.0

        # Base activation ratio: only top_k experts active per token.
        activation_ratio = top_k / num_experts

        # Add router overhead (typically 5-15% extra for gating)
        router_overhead = 0.1

        # For models with shared experts (like GLM), adjust accordingly
        if self.model_config.shared_expert_intermediate_size:
            # Shared expert is always active, so add its contribution
            # This is a simplified approximation
            activation_ratio = activation_ratio + (1.0 / num_experts)

        # Never exceed the dense-model multiplier.
        return min(1.0, activation_ratio + router_overhead)

    def calculate_moe_parameter_ratio(self) -> float:
        """Calculate effective parameter ratio for MoE models.

        For MoE models, only top_k experts are used during forward pass,
        but all expert parameters are stored in memory.

        Returns:
            Ratio of active parameters to total parameters (for memory
            estimation). Always 1.0 today: all expert parameters (and their
            gradients) must be resident, so storage is not reduced by routing.
            Kept as a hook for inference-specific calculations.
        """
        if not self.model_config.moe_enabled:
            return 1.0

        return 1.0  # All parameters stored in memory
diff --git a/src/gpu_mem_calculator/engines/deepspeed.py b/src/gpu_mem_calculator/engines/deepspeed.py
new file mode 100644
index 0000000000000000000000000000000000000000..40399064bc2aa86f636047593517e333eae48369
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/deepspeed.py
@@ -0,0 +1,316 @@
+"""DeepSpeed ZeRO engine implementation.
+
+Implements memory calculations for DeepSpeed ZeRO stages 1, 2, and 3.
+Based on: https://deepspeed.readthedocs.io/en/latest/memory.html
+"""
+
+from gpu_mem_calculator.core.formulas import (
+ calculate_activation_memory,
+ calculate_overhead,
+ estimate_largest_layer_params,
+)
+from gpu_mem_calculator.core.models import (
+ MemoryBreakdown,
+ MemoryResult,
+ OffloadDevice,
+)
+from gpu_mem_calculator.engines.base import BaseEngine
+from gpu_mem_calculator.utils.precision import gb_from_bytes
+
+
+class DeepSpeedEngine(BaseEngine):
+ """DeepSpeed ZeRO memory calculation.
+
+ Implements ZeRO stages:
+ - ZeRO-1: Shard optimizer states
+ - ZeRO-2: Shard optimizer states + gradients
+ - ZeRO-3: Shard optimizer states + gradients + parameters
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory requirements for DeepSpeed ZeRO training.
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ zero_stage = self.engine_config.zero_stage or 0
+ offload_optimizer = self.engine_config.offload_optimizer
+ offload_param = self.engine_config.offload_param
+
+ # Get largest layer params for ZeRO-3
+ if self.model_config.largest_layer_params is None:
+ largest_layer_params = estimate_largest_layer_params(
+ hidden_size=self.model_config.hidden_size,
+ num_attention_heads=self.model_config.num_attention_heads,
+ )
+ else:
+ largest_layer_params = self.model_config.largest_layer_params
+
+ match zero_stage:
+ case 0:
+ return self._calculate_zero0()
+ case 1:
+ return self._calculate_zero1(offload_optimizer)
+ case 2:
+ return self._calculate_zero2(offload_optimizer)
+ case 3:
+ return self._calculate_zero3(
+ offload_optimizer,
+ offload_param,
+ largest_layer_params,
+ )
+ case _:
+ # Default to ZeRO-2
+ return self._calculate_zero2(offload_optimizer)
+
+ def _calculate_zero0(self) -> MemoryResult:
+ """Calculate memory for ZeRO-0 (disabled, same as PyTorch DDP)."""
+ # Import here to avoid circular dependency
+ from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine
+
+ # ZeRO-0 is the same as PyTorch DDP
+ ddp_engine = PyTorchDDPEngine(
+ model_config=self.model_config,
+ training_config=self.training_config,
+ parallelism_config=self.parallelism_config,
+ engine_config=self.engine_config,
+ gpu_config=self.gpu_config,
+ )
+ return ddp_engine.calculate_memory()
+
    def _calculate_zero1(
        self,
        offload_optimizer: OffloadDevice,
    ) -> MemoryResult:
        """Calculate memory for ZeRO-1 (shard optimizer states).

        ZeRO-1 shards optimizer states across data parallel GPUs.

        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

        Memory formula:
        - offload_optimizer=cpu: 2 * params (fp16 params only on GPU)
        - offload_optimizer=none: 4 * params (fp16 params + fp32 params) +
          12 * params / num_gpus (sharded optimizer states)

        NOTE(review): the implementation below keeps full fp16 gradients on
        GPU in both cases (so the CPU-offload GPU footprint is ~4 * params,
        not the 2 * params quoted above) — confirm which is intended.

        Note: Optimizer states = 12 bytes per param for Adam/AdamW
        - 4 bytes: FP32 parameter copy
        - 4 bytes: Momentum (FP32)
        - 4 bytes: Variance (FP32)
        """
        num_params = self.model_config.num_parameters
        num_gpus = self.total_num_gpus

        # Model parameters (fp16/bf16 on GPU) — NOT sharded in ZeRO-1
        model_params_gb = gb_from_bytes(num_params * 2)  # FP16/BF16 = 2 bytes

        # Gradients (fp16 on GPU) — NOT sharded in ZeRO-1 (full copy per GPU)
        gradients_gb = gb_from_bytes(num_params * 2)

        # Optimizer states (sharded across GPUs, possibly offloaded to CPU)
        # 12 bytes per param for Adam/AdamW (FP32 params copy + momentum + variance)
        if offload_optimizer == OffloadDevice.CPU:
            # Offloaded to CPU, minimal GPU memory for optimizer
            optimizer_gb = 0.0
            cpu_memory_gb = gb_from_bytes(num_params * 12)  # Full optimizer on CPU
        else:
            # Sharded across GPUs: 12 bytes / num_gpus per GPU
            optimizer_gb = gb_from_bytes((num_params * 12) / num_gpus)
            cpu_memory_gb = 0.0

        # Activations (same as baseline — ZeRO sharding does not reduce them)
        activations_gb = calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

        # Overhead (fragmentation, buffers, etc.) proportional to the subtotal
        base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
        overhead_gb = calculate_overhead(base_memory)

        breakdown = MemoryBreakdown(
            model_params_gb=model_params_gb,
            gradients_gb=gradients_gb,
            optimizer_states_gb=optimizer_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown, cpu_memory_gb)
+
+ def _calculate_zero2(
+ self,
+ offload_optimizer: OffloadDevice,
+ ) -> MemoryResult:
+ """Calculate memory for ZeRO-2 (shard optimizer + gradients).
+
+ ZeRO-2 shards optimizer states AND gradients across data parallel GPUs.
+
+ Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
+ Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
+
+ Memory formula:
+ - offload_optimizer=cpu: 2 * params (fp16 params) +
+ (2 * params / num_gpus) (sharded fp16 grads)
+ - offload_optimizer=none: 2 * params (fp16 params) +
+ 2 * params / num_gpus (sharded fp16 grads) +
+ 12 * params / num_gpus (sharded optimizer states)
+
+ Note: Unlike ZeRO-1, ZeRO-2 shards gradients across GPUs
+ """
+ num_params = self.model_config.num_parameters
+ num_gpus = self.total_num_gpus
+
+ # Model parameters (fp16/bf16 on GPU) - NOT sharded in ZeRO-2
+ model_params_gb = gb_from_bytes(num_params * 2) # FP16/BF16 = 2 bytes
+
+ # Gradients (fp16 on GPU) - SHARDED in ZeRO-2
+ gradients_gb = gb_from_bytes((num_params * 2) / num_gpus)
+
+ # Optimizer states (sharded across GPUs, possibly offloaded to CPU)
+ # 12 bytes per param for Adam/AdamW (FP32 params copy + momentum + variance)
+ if offload_optimizer == OffloadDevice.CPU:
+ # Offloaded to CPU, minimal GPU memory for optimizer
+ optimizer_gb = 0.0
+ cpu_memory_gb = gb_from_bytes(num_params * 12) # Full optimizer on CPU
+ else:
+ # Sharded across GPUs: 12 bytes / num_gpus per GPU
+ optimizer_gb = gb_from_bytes((num_params * 12) / num_gpus)
+ cpu_memory_gb = 0.0
+
+ # Activations (same as baseline)
+ activations_gb = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = calculate_overhead(base_memory)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown, cpu_memory_gb)
+
+    def _calculate_zero3(
+        self,
+        offload_optimizer: OffloadDevice,
+        offload_param: OffloadDevice,
+        largest_layer_params: int,
+    ) -> MemoryResult:
+        """Calculate memory for ZeRO-3 (shard params + optimizer + gradients).
+
+        ZeRO-3 shards everything across GPUs.
+
+        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
+        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
+
+        Memory formula (DeepSpeed's published estimates):
+        - largest_layer_memory = 4 * largest_layer_params (fp16 params + fp16 grads)
+
+        Case 1 (no offload):
+            largest_layer_memory + 18 * params / num_gpus
+            (where 18 = 16 bytes optimizer states + 2 bytes fp16 params)
+
+        Case 2 (param + optimizer offload to CPU):
+            largest_layer_memory (main limit is CPU RAM)
+
+        Case 3 (optimizer offload to CPU only):
+            largest_layer_memory + 2 * params / num_gpus
+
+        Note: Optimizer states = 16 bytes per param for Adam/AdamW (FP32)
+        - 4 bytes: FP32 parameter copy
+        - 4 bytes: Momentum (FP32)
+        - 4 bytes: Variance (FP32)
+        - 4 bytes: Gradient (FP32 copy for optimizer update)
+
+        NOTE(review): the implementation below additionally counts a sharded
+        fp16 gradient term (2 * params / num_gpus) in Cases 1 and 3, so it
+        sums to 20 * params / num_gpus (Case 1) and 4 * params / num_gpus
+        (Case 3) — deliberately more conservative than the formulas above,
+        or a discrepancy to reconcile with the DeepSpeed estimator; confirm
+        the intended behavior before changing either.
+
+        Args:
+            offload_optimizer: Where optimizer states live (GPU or CPU).
+            offload_param: Where parameters live (GPU or CPU).
+            largest_layer_params: Parameter count of the largest single
+                layer (the unit that must be fully gathered on one GPU).
+
+        Returns:
+            MemoryResult with the per-GPU breakdown and any CPU-side memory.
+        """
+        num_params = self.model_config.num_parameters
+        num_gpus = self.total_num_gpus
+
+        # Largest layer memory (fp16 params + fp16 grads gathered on one GPU):
+        # 2 bytes params + 2 bytes grads = 4 bytes per largest-layer param.
+        largest_layer_memory_gb = gb_from_bytes(largest_layer_params * 4)
+
+        # Calculate memory based on offload configuration
+        if offload_param == OffloadDevice.CPU and offload_optimizer == OffloadDevice.CPU:
+            # Case 2: Both params and optimizer offloaded to CPU.
+            # Only the currently-gathered layer occupies the GPU; the full
+            # 18 bytes/param (fp16 params + FP32 optimizer set) sit in host RAM.
+            params_per_gpu_gb = 0.0
+            gradients_per_gpu_gb = 0.0
+            optimizer_gb = 0.0
+            cpu_memory_gb = gb_from_bytes(num_params * 18)  # Full model on CPU
+        elif offload_optimizer == OffloadDevice.CPU:
+            # Case 3: Only optimizer offloaded to CPU.
+            # GPU keeps sharded fp16 params and (see NOTE above) sharded
+            # fp16 grads; the 16-byte FP32 optimizer set lives in host RAM.
+            params_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+            gradients_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+            optimizer_gb = 0.0
+            cpu_memory_gb = gb_from_bytes(num_params * 16)  # Optimizer on CPU
+        else:
+            # Case 1: No offload — everything sharded across the GPUs.
+            params_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+            gradients_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+            optimizer_gb = gb_from_bytes((num_params * 16) / num_gpus)  # FP32
+            cpu_memory_gb = 0.0
+
+        # Model params = largest layer for ZeRO-3 (the transient gather buffer).
+        model_params_gb = largest_layer_memory_gb
+
+        # Activations (unaffected by ZeRO sharding).
+        activations_gb = calculate_activation_memory(
+            batch_size=self.training_config.batch_size,
+            seq_len=self.model_config.max_seq_len,
+            hidden_size=self.model_config.hidden_size,
+            num_layers=self.model_config.num_layers,
+            num_attention_heads=self.model_config.num_attention_heads,
+            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+            activation_checkpointing=self.training_config.activation_checkpointing,
+            moe_enabled=self.model_config.moe_enabled,
+            num_experts=self.model_config.num_experts,
+            top_k=self.model_config.top_k,
+            expert_intermediate_size=self.model_config.expert_intermediate_size,
+        )
+
+        # Fixed-fraction overhead on top of the tracked components.
+        base_memory = (
+            model_params_gb
+            + params_per_gpu_gb
+            + gradients_per_gpu_gb
+            + optimizer_gb
+            + activations_gb
+        )
+        overhead_gb = calculate_overhead(base_memory)
+
+        # For ZeRO-3, we combine params/gradients/optimizer into model_params in breakdown
+        breakdown = MemoryBreakdown(
+            model_params_gb=model_params_gb + params_per_gpu_gb,
+            gradients_gb=gradients_per_gpu_gb,
+            optimizer_states_gb=optimizer_gb,
+            activations_gb=activations_gb,
+            overhead_gb=overhead_gb,
+        )
+
+        return self._create_result(breakdown, cpu_memory_gb)
diff --git a/src/gpu_mem_calculator/engines/fsdp.py b/src/gpu_mem_calculator/engines/fsdp.py
new file mode 100644
index 0000000000000000000000000000000000000000..00a9a71a0b819b2f05fe1cfa593a4e88e59c830c
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/fsdp.py
@@ -0,0 +1,213 @@
+"""FSDP (Fully Sharded Data Parallel) engine implementation.
+
+Implements memory calculations for PyTorch FSDP.
+
+Reference: https://pytorch.org/docs/stable/fsdp.html
+Reference: https://blog.eleuther.ai/transformer-math/
+"""
+
+from gpu_mem_calculator.core.formulas import (
+ calculate_activation_memory,
+ calculate_overhead,
+ estimate_largest_layer_params,
+)
+from gpu_mem_calculator.core.models import (
+ MemoryBreakdown,
+ MemoryResult,
+)
+from gpu_mem_calculator.engines.base import BaseEngine
+from gpu_mem_calculator.utils.precision import gb_from_bytes
+
+
+class FSDPEngine(BaseEngine):
+ """PyTorch FSDP memory calculation.
+
+ FSDP shards model parameters, gradients, and optimizer states
+ across data parallel GPUs, similar to DeepSpeed ZeRO-3.
+
+ Sharding strategies:
+ - NO_SHARD: Equivalent to DDP (no sharding)
+ - SHARD_GRAD_OP: Shard gradients and optimizer states (like ZeRO-2)
+ - FULL_SHARD: Shard everything (like ZeRO-3)
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory requirements for FSDP training.
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ sharding_strategy = self.engine_config.sharding_strategy
+
+ # Get largest layer params for FULL_SHARD
+ if self.model_config.largest_layer_params is None:
+ largest_layer_params = estimate_largest_layer_params(
+ hidden_size=self.model_config.hidden_size,
+ num_attention_heads=self.model_config.num_attention_heads,
+ )
+ else:
+ largest_layer_params = self.model_config.largest_layer_params
+
+ match sharding_strategy:
+ case "no_shard":
+ return self._calculate_no_shard()
+ case "shard_grad_op":
+ return self._calculate_shard_grad_op()
+ case "full_shard":
+ return self._calculate_full_shard(largest_layer_params)
+ case _:
+ # Default to full shard
+ return self._calculate_full_shard(largest_layer_params)
+
+ def _calculate_no_shard(self) -> MemoryResult:
+ """Calculate memory for NO_SHARD (same as DDP).
+
+ No sharding - each GPU holds a full copy of the model.
+ """
+ # Import PyTorch DDP engine
+ from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine
+
+ ddp_engine = PyTorchDDPEngine(
+ model_config=self.model_config,
+ training_config=self.training_config,
+ parallelism_config=self.parallelism_config,
+ engine_config=self.engine_config,
+ gpu_config=self.gpu_config,
+ )
+ return ddp_engine.calculate_memory()
+
+ def _calculate_shard_grad_op(self) -> MemoryResult:
+ """Calculate memory for SHARD_GRAD_OP.
+
+ Shards gradients and optimizer states across GPUs.
+ Similar to DeepSpeed ZeRO-2.
+
+ Reference: https://pytorch.org/tutorials/intermediate/FSDP_advanced.html
+ Reference: https://blog.eleuther.ai/transformer-math/
+
+ Memory formula:
+ - Model parameters: Full model on each GPU (not sharded)
+ - Gradients: Sharded across GPUs
+ - Optimizer states: Sharded across GPUs (12 bytes per param for Adam/AdamW)
+
+ Note: Optimizer states = 12 bytes per param for Adam/AdamW
+ - 4 bytes: FP32 parameter copy
+ - 4 bytes: Momentum (FP32)
+ - 4 bytes: Variance (FP32)
+ """
+ num_params = self.model_config.num_parameters
+ num_gpus = self.total_num_gpus
+
+ # Model parameters (full model on each GPU)
+ model_params_gb = gb_from_bytes(num_params * 2) # FP16/BF16
+
+ # Gradients (sharded)
+ gradients_gb = gb_from_bytes((num_params * 2) / num_gpus)
+
+ # Optimizer states (sharded) - 12 bytes per param for Adam/AdamW
+ optimizer_gb = gb_from_bytes((num_params * 12) / num_gpus) # FP32
+
+ # Activations
+ activations_gb = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = calculate_overhead(base_memory)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
+
+ def _calculate_full_shard(self, largest_layer_params: int) -> MemoryResult:
+ """Calculate memory for FULL_SHARD.
+
+ Shards parameters, gradients, and optimizer states.
+ Similar to DeepSpeed ZeRO-3.
+
+ Reference: https://pytorch.org/tutorials/intermediate/FSDP_advanced.html
+ Reference: https://blog.eleuther.ai/transformer-math/
+
+ Memory formula:
+ - Largest layer: 4 * largest_layer_params (fp16 params + fp16 grads)
+ - Remaining parameters and gradients: Sharded across GPUs (2 bytes fp16 each)
+ - Optimizer states: Sharded across GPUs (12 bytes per param for Adam/AdamW in FP32)
+
+ Total per GPU: largest_layer_memory + 2 * params / num_gpus +
+ 2 * params / num_gpus + 12 * params / num_gpus
+ = largest_layer_memory + 16 * params / num_gpus
+
+ Note: FSDP typically uses 12 bytes for optimizer states (not 16 like DeepSpeed ZeRO-3)
+ because FSDP doesn't keep an additional FP32 gradient copy in the optimizer states.
+ """
+ num_params = self.model_config.num_parameters
+ num_gpus = self.total_num_gpus
+
+ # Largest layer memory (fp16 params + fp16 grads gathered during compute)
+ largest_layer_memory_gb = gb_from_bytes(largest_layer_params * 4)
+
+ # Sharded parameters (fp16)
+ params_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+
+ # Sharded gradients (fp16)
+ gradients_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+
+ # Sharded optimizer states (FP32 for Adam/AdamW)
+ # 12 bytes per param: 4 bytes fp32 params copy + 4 bytes momentum + 4 bytes variance
+ optimizer_per_gpu_gb = gb_from_bytes((num_params * 12) / num_gpus)
+
+ # Model params in breakdown: largest layer (gathered) + sharded params
+ # This represents the total parameter memory on each GPU
+ model_params_gb = largest_layer_memory_gb + params_per_gpu_gb
+
+ # Activations
+ activations_gb = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Overhead
+ base_memory = (
+ largest_layer_memory_gb
+ + params_per_gpu_gb
+ + gradients_per_gpu_gb
+ + optimizer_per_gpu_gb
+ + activations_gb
+ )
+ overhead_gb = calculate_overhead(base_memory)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_per_gpu_gb,
+ optimizer_states_gb=optimizer_per_gpu_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
diff --git a/src/gpu_mem_calculator/engines/megatron.py b/src/gpu_mem_calculator/engines/megatron.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cacfdaca3b906b1058c25f3c61626da8bc079df
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/megatron.py
@@ -0,0 +1,257 @@
+"""Megatron-LM engine implementation.
+
+Implements memory calculations for Megatron-LM with tensor, pipeline,
+and sequence parallelism.
+
+Reference: https://github.com/NVIDIA/Megatron-LM
+Reference: https://arxiv.org/abs/1909.08053
+Reference: https://blog.eleuther.ai/transformer-math/
+"""
+
+from gpu_mem_calculator.core.formulas import (
+ calculate_activation_memory,
+ calculate_gradient_memory,
+ calculate_optimizer_memory,
+ calculate_overhead,
+ calculate_parameter_memory,
+)
+from gpu_mem_calculator.core.models import (
+ MemoryBreakdown,
+ MemoryResult,
+)
+from gpu_mem_calculator.engines.base import BaseEngine
+from gpu_mem_calculator.utils.precision import gb_from_bytes
+
+
+class MegatronLMEngine(BaseEngine):
+ """Megatron-LM memory calculation.
+
+ Megatron-LM uses tensor parallelism to split individual layers across GPUs,
+ and optionally pipeline parallelism to split layers across GPUs.
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory requirements for Megatron-LM training.
+
+ Megatron-LM memory characteristics:
+ - Parameters are sharded across tensor parallel GPUs
+ - Gradients are sharded across tensor parallel GPUs
+ - Optimizer states can be sharded or replicated
+ - Activations depend on tensor/pipeline/sequence parallelism
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ tp_size = self.parallelism_config.tensor_parallel_size
+ pp_size = self.parallelism_config.pipeline_parallel_size
+ seq_parallel = self.parallelism_config.sequence_parallel
+
+ # 1. Model parameters (sharded by tensor parallelism)
+ # Each TP GPU holds 1/tp of the parameters
+ params_per_gpu = self.model_config.num_parameters / tp_size
+ model_params_gb = calculate_parameter_memory(
+ num_params=int(params_per_gpu),
+ dtype=self.training_config.dtype.value,
+ )
+
+ # 2. Gradients (sharded by tensor parallelism)
+ gradients_gb = calculate_gradient_memory(
+ num_params=int(params_per_gpu),
+ dtype=self.training_config.dtype.value,
+ )
+
+ # 3. Optimizer states
+ # In Megatron-LM, optimizer states are typically sharded similarly to parameters
+ # for tensor parallelism, but this can vary based on configuration
+ optimizer_gb = calculate_optimizer_memory(
+ num_params=int(params_per_gpu),
+ optimizer=self.training_config.optimizer.value,
+ )
+
+ # 4. Activations
+ # Activations are affected by:
+ # - Tensor parallelism: splits activations across TP GPUs
+ # - Pipeline parallelism: only holds activations for current stage
+ # - Sequence parallelism: splits sequence dimension
+ activations_gb = self._calculate_megatron_activations(
+ tp_size=tp_size,
+ pp_size=pp_size,
+ seq_parallel=seq_parallel,
+ )
+
+ # 5. Overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = calculate_overhead(base_memory)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
+
+ def _calculate_megatron_activations(
+ self,
+ tp_size: int,
+ pp_size: int,
+ seq_parallel: bool,
+ ) -> float:
+ """Calculate activation memory for Megatron-LM.
+
+ Megatron-LM activations are affected by parallelism strategy:
+ - Tensor parallelism: splits hidden dimension
+ - Pipeline parallelism: only current stage's activations
+ - Sequence parallelism: splits sequence dimension
+
+ Args:
+ tp_size: Tensor parallelism size
+ pp_size: Pipeline parallelism size
+ seq_parallel: Whether sequence parallelism is enabled
+
+ Returns:
+ Activation memory in GB
+ """
+
+ # Base activation memory
+ base_activations = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=tp_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Adjust for pipeline parallelism
+ # Each PP stage only holds num_layers / pp_size layers
+ pp_factor = 1.0 / pp_size
+
+ # Adjust for sequence parallelism
+ # If enabled, splits sequence dimension across TP GPUs
+ if seq_parallel and tp_size > 1:
+ seq_factor = 1.0 / tp_size
+ else:
+ seq_factor = 1.0
+
+ return base_activations * pp_factor * seq_factor
+
+
+class MegatronDeepSpeedEngine(BaseEngine):
+ """Megatron-LM + DeepSpeed combined engine.
+
+ This combines Megatron-LM's tensor/pipeline parallelism with
+ DeepSpeed ZeRO's optimizer/gradient sharding.
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory for Megatron-LM + DeepSpeed.
+
+ This uses:
+ - Megatron-LM for tensor/pipeline parallelism and activation memory
+ - DeepSpeed ZeRO for optimizer/gradient sharding
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ # Import DeepSpeed engine
+
+ # First calculate activation memory using Megatron-LM approach
+ tp_size = self.parallelism_config.tensor_parallel_size
+ pp_size = self.parallelism_config.pipeline_parallel_size
+ seq_parallel = self.parallelism_config.sequence_parallel
+
+ activations_gb = self._calculate_megatron_activations(
+ tp_size=tp_size,
+ pp_size=pp_size,
+ seq_parallel=seq_parallel,
+ )
+
+ # For parameters, gradients, optimizer - use DeepSpeed ZeRO logic
+ # But account for tensor parallelism (parameters are already split by TP)
+ tp_size = self.parallelism_config.tensor_parallel_size
+ params_per_gpu = self.model_config.num_parameters / tp_size
+
+ zero_stage = self.engine_config.zero_stage or 2
+ offload_optimizer = self.engine_config.offload_optimizer
+
+ # Model parameters (sharded by TP, then possibly by ZeRO)
+ if zero_stage >= 3:
+ # ZeRO-3 shards further
+ dp_size = self.parallelism_config.data_parallel_size
+ model_params_gb = gb_from_bytes((params_per_gpu * 2) / dp_size)
+ else:
+ # ZeRO-0/1/2 keeps parameters on each TP GPU
+ model_params_gb = gb_from_bytes(params_per_gpu * 2)
+
+ # Gradients
+ if zero_stage >= 2:
+ dp_size = self.parallelism_config.data_parallel_size
+ gradients_gb = gb_from_bytes((params_per_gpu * 2) / dp_size)
+ else:
+ gradients_gb = gb_from_bytes(params_per_gpu * 2)
+
+ # Optimizer states (12 bytes per param for Adam/AdamW in FP32)
+ if offload_optimizer.value == "cpu":
+ optimizer_gb = 0.0
+ else:
+ if zero_stage >= 1:
+ dp_size = self.parallelism_config.data_parallel_size
+ optimizer_gb = gb_from_bytes((params_per_gpu * 12) / dp_size)
+ else:
+ optimizer_gb = gb_from_bytes(params_per_gpu * 12)
+
+ # Overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = gb_from_bytes(base_memory * 0.2)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
+
+ def _calculate_megatron_activations(
+ self,
+ tp_size: int,
+ pp_size: int,
+ seq_parallel: bool,
+ ) -> float:
+ """Calculate activation memory for Megatron-LM."""
+
+ # Base activation memory
+ base_activations = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=tp_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Adjust for pipeline parallelism
+ pp_factor = 1.0 / pp_size
+
+ # Adjust for sequence parallelism
+ if seq_parallel and tp_size > 1:
+ seq_factor = 1.0 / tp_size
+ else:
+ seq_factor = 1.0
+
+ return base_activations * pp_factor * seq_factor
diff --git a/src/gpu_mem_calculator/engines/pytorch.py b/src/gpu_mem_calculator/engines/pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed3d8ae9df75c7870d7929617214e67cfc8a3dd7
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/pytorch.py
@@ -0,0 +1,88 @@
+"""PyTorch DDP (Distributed Data Parallel) engine implementation.
+
+This is the baseline implementation without any memory optimizations.
+
+Reference: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
+Reference: https://blog.eleuther.ai/transformer-math/
+"""
+
+from gpu_mem_calculator.core.formulas import (
+ calculate_activation_memory,
+ calculate_gradient_memory,
+ calculate_optimizer_memory,
+ calculate_overhead,
+ calculate_parameter_memory,
+)
+from gpu_mem_calculator.core.models import (
+ MemoryBreakdown,
+ MemoryResult,
+)
+from gpu_mem_calculator.engines.base import BaseEngine
+
+
+class PyTorchDDPEngine(BaseEngine):
+ """PyTorch DDP memory calculation.
+
+ DDP replicates the model on each GPU, so memory is not sharded.
+ Each GPU holds a full copy of the model, gradients, and optimizer states.
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory requirements for PyTorch DDP training.
+
+ For DDP:
+ - Model parameters: Full model on each GPU
+ - Gradients: Full gradients on each GPU
+ - Optimizer states: Full optimizer states on each GPU (FP32)
+ - Activations: Batch size dependent, split by data parallel
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ # 1. Model parameters (in the specified dtype)
+ model_params_gb = calculate_parameter_memory(
+ num_params=self.model_config.num_parameters,
+ dtype=self.training_config.dtype.value,
+ )
+
+ # 2. Gradients (same precision as parameters for mixed precision)
+ gradients_gb = calculate_gradient_memory(
+ num_params=self.model_config.num_parameters,
+ dtype=self.training_config.dtype.value,
+ )
+
+ # 3. Optimizer states (always FP32 for Adam/AdamW)
+ optimizer_gb = calculate_optimizer_memory(
+ num_params=self.model_config.num_parameters,
+ optimizer=self.training_config.optimizer.value,
+ )
+
+ # 4. Activations (depends on batch size and model architecture)
+ activations_gb = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # 5. Calculate overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = calculate_overhead(base_memory)
+
+ # Create breakdown
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
diff --git a/src/gpu_mem_calculator/exporters/__init__.py b/src/gpu_mem_calculator/exporters/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebd4570a402ff9017cf98c8058cb6324f5426ebe
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/__init__.py
@@ -0,0 +1,14 @@
+"""Framework configuration exporters."""
+
+from gpu_mem_calculator.exporters.accelerate import AccelerateExporter
+from gpu_mem_calculator.exporters.axolotl import AxolotlExporter
+from gpu_mem_calculator.exporters.lightning import LightningExporter
+from gpu_mem_calculator.exporters.manager import ExportFormat, ExportManager
+
+__all__ = [
+ "ExportManager",
+ "ExportFormat",
+ "AccelerateExporter",
+ "LightningExporter",
+ "AxolotlExporter",
+]
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49d72f4e608a33580e83f6de232bb5a3065ecbe2
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15b15a1825a045c015b20a455999e8fbcb48dbf3
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0a3eb673d5b78a81d610973234ac6fb947e1777
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a292093753ce8efa721a5353db1b5441bfba33df
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/manager.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/manager.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1326c58a49468421be1f2e7427848ec4900db182
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/manager.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/accelerate.py b/src/gpu_mem_calculator/exporters/accelerate.py
new file mode 100644
index 0000000000000000000000000000000000000000..a60dadf9da95137d194ed3933ce703188b765b66
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/accelerate.py
@@ -0,0 +1,187 @@
+"""HuggingFace Accelerate configuration exporter.
+
+Generates configuration files for HuggingFace Accelerate distributed training.
+"""
+
+from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
+class AccelerateExporter:
+ """Export configuration to HuggingFace Accelerate format.
+
+ Accelerate uses a YAML configuration file to configure distributed
+ training strategies including FSDP, DeepSpeed, and multi-GPU setups.
+ """
+
+ def __init__(
+ self,
+ model_config: ModelConfig,
+ training_config: TrainingConfig,
+ parallelism_config: ParallelismConfig,
+ engine_config: EngineConfig,
+ node_config: NodeConfig | None = None,
+ ) -> None:
+ """Initialize the Accelerate exporter.
+
+ Args:
+ model_config: Model architecture configuration
+ training_config: Training hyperparameters
+ parallelism_config: Parallelism settings
+ engine_config: Training engine configuration
+ node_config: Multi-node configuration (optional)
+ """
+ self.model_config = model_config
+ self.training_config = training_config
+ self.parallelism_config = parallelism_config
+ self.engine_config = engine_config
+ self.node_config = node_config
+
+ def export(self) -> dict:
+ """Export configuration to Accelerate format.
+
+ Returns:
+ Dictionary compatible with Accelerate config file format
+ """
+ config: dict = {
+ "compute_environment": (
+ "LOCAL_MACHINE"
+ if not self.node_config or self.node_config.num_nodes == 1
+ else "MULTI_GPU"
+ ),
+ "distributed_type": self._get_distributed_type(),
+ "mixed_precision": self._get_mixed_precision(),
+ "downcast_bf16": self._get_downcast_bf16(),
+ }
+
+ # Add multi-GPU configuration
+ if self.node_config and self.node_config.num_nodes > 1:
+ config["num_machines"] = self.node_config.num_nodes
+ config["num_processes"] = self.node_config.gpus_per_node or 1
+ config["main_process_port"] = 29500
+ config["main_training_function"] = "main"
+
+ # Add FSDP configuration if using FSDP
+ if self.engine_config.type == EngineType.FSDP:
+ config["fsdp_config"] = self._get_fsdp_config()
+
+ # Add DeepSpeed configuration if using DeepSpeed
+ if self.engine_config.type == EngineType.DEEPSPEED:
+ config["deepspeed_config"] = self._get_deepspeed_config()
+
+ return config
+
+ def _get_distributed_type(self) -> str:
+ """Get Accelerate distributed type."""
+ if self.engine_config.type == EngineType.FSDP:
+ return "FSDP"
+ elif self.engine_config.type == EngineType.DEEPSPEED:
+ return "DEEPSPEED"
+ elif self.parallelism_config.tensor_parallel_size > 1:
+ return "MEGATRON_LM"
+ elif self.parallelism_config.data_parallel_size > 1:
+ return "MULTI_GPU"
+ else:
+ return "NO"
+
+ def _get_mixed_precision(self) -> str:
+ """Get mixed precision setting."""
+ dtype_map = {
+ DType.BF16: "bf16",
+ DType.FP16: "fp16",
+ DType.FP32: "no",
+ }
+ return dtype_map.get(self.training_config.dtype, "no")
+
+ def _get_downcast_bf16(self) -> str:
+ """Get downcast BF16 setting."""
+ return "no" if self.training_config.dtype == DType.BF16 else "no"
+
+ def _get_fsdp_config(self) -> dict:
+ """Get FSDP-specific configuration."""
+ sharding_strategy_map = {
+ "no_shard": "NO_SHARD",
+ "shard_grad_op": "SHARD_GRAD_OP",
+ "full_shard": "FULL_SHARD",
+ }
+
+ config = {
+ "fsdp_sharding_strategy": sharding_strategy_map.get(
+ self.engine_config.sharding_strategy, "FULL_SHARD"
+ ),
+ "fsdp_offload_params": False,
+ "fsdp_origin_params": True,
+ "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+ "fsdp_transformer_layer_cls_to_wrap": self._get_transformer_layer_cls(),
+ "fsdp_backward_prefetch": "BACKWARD_PRE",
+ "fsdp_forward_prefetch": False,
+ "fsdp_use_orig_params": True,
+ "fsdp_cpu_ram_efficient_loading": True,
+ }
+
+ # Add activation checkpointing if enabled
+ if self.training_config.activation_checkpointing > 0:
+ config["fsdp_activation_checkpointing"] = True
+
+ return config
+
+ def _get_deepspeed_config(self) -> dict:
+ """Get DeepSpeed-specific configuration."""
+ zero_opt: dict = {
+ "stage": self.engine_config.zero_stage or 2,
+ }
+
+ config: dict = {
+ "train_batch_size": self.training_config.batch_size,
+ "train_micro_batch_size_per_gpu": self.training_config.batch_size,
+ "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+ "zero_optimization": zero_opt,
+ "bf16": {"enabled": self.training_config.dtype == DType.BF16},
+ "fp16": {"enabled": self.training_config.dtype == DType.FP16},
+ "gradient_clipping": 1.0,
+ "prescale_gradients": False,
+ "steps_per_print": 100,
+ }
+
+ # Add offload configuration if specified
+ if self.engine_config.offload_optimizer != "none":
+ config["zero_optimization"]["offload_optimizer"] = {
+ "device": "cpu" if self.engine_config.offload_optimizer == "cpu" else "nvme",
+ "pin_memory": True,
+ }
+
+ if self.engine_config.offload_param != "none":
+ config["zero_optimization"]["offload_param"] = {
+ "device": "cpu" if self.engine_config.offload_param == "cpu" else "nvme",
+ "pin_memory": True,
+ }
+
+ return config
+
+ def _get_transformer_layer_cls(self) -> list[str]:
+ """Get transformer layer class names for FSDP auto-wrapping.
+
+ Returns a list of common transformer layer class names based on model architecture.
+ """
+ # Common transformer layer class names
+ common_layers = [
+ "BertLayer",
+ "GPTJBlock",
+ "GPT2Block",
+ "BloomBlock",
+ "LlamaDecoderLayer",
+ "MistralDecoderLayer",
+ "MixtralDecoderLayer",
+ "Qwen2DecoderLayer",
+ "GemmaDecoderLayer",
+ ]
+
+ # Could be customized based on model_config.name
+ return common_layers
diff --git a/src/gpu_mem_calculator/exporters/axolotl.py b/src/gpu_mem_calculator/exporters/axolotl.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c5d397f7972de144203362faec743066287ced5
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/axolotl.py
@@ -0,0 +1,238 @@
+"""Axolotl configuration exporter.
+
+Generates configuration files for Axolotl fine-tuning framework.
+"""
+
+from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
+class AxolotlExporter:
+    """Export configuration to Axolotl YAML format.
+
+    Axolotl uses a YAML configuration file for fine-tuning LLMs
+    with various backends including DeepSpeed, FSDP, and XLA.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        training_config: TrainingConfig,
+        parallelism_config: ParallelismConfig,
+        engine_config: EngineConfig,
+        node_config: NodeConfig | None = None,
+    ) -> None:
+        """Initialize the Axolotl exporter.
+
+        Args:
+            model_config: Model architecture configuration
+            training_config: Training hyperparameters
+            parallelism_config: Parallelism settings
+            engine_config: Training engine configuration
+            node_config: Multi-node configuration (optional)
+        """
+        self.model_config = model_config
+        self.training_config = training_config
+        self.parallelism_config = parallelism_config
+        self.engine_config = engine_config
+        self.node_config = node_config
+
+    def export(self) -> dict:
+        """Export configuration to Axolotl YAML format.
+
+        Returns:
+            Dictionary compatible with Axolotl config file format
+        """
+        config = {
+            # Base model configuration
+            "base_model": self._get_base_model(),
+            "model_type": self._get_model_type(),
+            # Tokenizer
+            "tokenizer_type": "AutoTokenizer",
+            # Training configuration
+            "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+            # NOTE(review): batch_size and micro_batch_size are set to the
+            # same value; Axolotl conventionally treats batch_size as
+            # micro_batch_size * gradient_accumulation_steps — confirm.
+            "batch_size": self.training_config.batch_size,
+            "micro_batch_size": self.training_config.batch_size,
+            "num_epochs": 3,
+            "learning_rate": 2e-4,
+            # 8-bit AdamW is selected only for BF16 runs; other dtypes fall
+            # back to standard torch AdamW.
+            "optimizer": (
+                "adamw_bnb_8bit" if self.training_config.dtype == DType.BF16 else "adamw_torch"
+            ),
+            "bf16": self.training_config.dtype == DType.BF16,
+            "fp16": self.training_config.dtype == DType.FP16,
+            "tf32": True,
+            "gradient_checkpointing": self.training_config.activation_checkpointing > 0,
+        }
+
+        # Add special tokens configuration
+        # NOTE(review): all special tokens are empty strings — verify whether
+        # real token values (e.g. "<s>", "</s>") are expected by Axolotl.
+        config.update(
+            {
+                "special_tokens": {
+                    "bos_token": "",
+                    "eos_token": "",
+                    "unk_token": "",
+                    "pad_token": "",
+                }
+            }
+        )
+
+        # Add distributed training configuration
+        if self.engine_config.type == EngineType.DEEPSPEED:
+            config["deepspeed"] = self._get_deepspeed_config()
+        elif self.engine_config.type == EngineType.FSDP:
+            config["fsdp"] = self._get_fsdp_config()
+
+        # Add multi-GPU configuration
+        if self.node_config and self.node_config.num_nodes > 1:
+            config["num_nodes"] = self.node_config.num_nodes
+            config["gpus_per_node"] = self.node_config.gpus_per_node
+
+        # Add additional training parameters
+        config.update(
+            {
+                "val_set_size": 0.1,  # 10% validation
+                "output_dir": "./output",
+                "logging_steps": 10,
+                "save_steps": 100,
+                "eval_steps": 100,
+                "save_total_limit": 2,
+                "lr_scheduler": "cosine",
+                "warmup_ratio": 0.03,
+                "weight_decay": 0.0,
+                "max_grad_norm": 1.0,
+            }
+        )
+
+        return config
+
+    def _get_base_model(self) -> str:
+        """Get base model path/name.
+
+        Returns a placeholder or extracts from model_config.name
+        """
+        # Try to construct a reasonable model path.
+        # Known short names map to their HuggingFace Hub repo IDs; anything
+        # else passes through unchanged.
+        name_map = {
+            "llama2-7b": "meta-llama/Llama-2-7b-hf",
+            "llama2-13b": "meta-llama/Llama-2-13b-hf",
+            "llama2-70b": "meta-llama/Llama-2-70b-hf",
+            "mistral-7b": "mistralai/Mistral-7B-v0.1",
+            "mixtral-8x7b": "mistralai/Mixtral-8x7B-v0.1",
+            "gpt3-175b": "gpt3-175b-placeholder",  # Not on HF
+        }
+
+        return name_map.get(self.model_config.name.lower(), self.model_config.name)
+
+    def _get_model_type(self) -> str:
+        """Get model type for Axolotl.
+
+        Matches by substring of the lowercased model name; first match wins
+        (dict insertion order), so e.g. "mixtral" must be checked via its own
+        key before the "mistral" substring could — note "mixtral" does not
+        contain "mistral", so ordering is safe here.
+        """
+        model_type_map = {
+            "llama": "LlamaForCausalLM",
+            "mistral": "MistralForCausalLM",
+            "mixtral": "MixtralForCausalLM",
+            "qwen": "Qwen2ForCausalLM",
+            "gemma": "GemmaForCausalLM",
+            "bloom": "BloomForCausalLM",
+            "gpt2": "GPT2LMHeadModel",
+            "gptj": "GPTJForCausalLM",
+            "bert": "BertForMaskedLM",
+        }
+
+        model_name_lower = self.model_config.name.lower()
+        for key, value in model_type_map.items():
+            if key in model_name_lower:
+                return value
+
+        return "LlamaForCausalLM"  # Default
+
+    def _get_deepspeed_config(self) -> dict:
+        """Get DeepSpeed configuration for Axolotl.
+
+        Returns a DeepSpeed JSON-style dict; here train_batch_size is the
+        micro batch multiplied by gradient accumulation (per-GPU effective
+        batch).
+        """
+        zero_opt: dict = {
+            "stage": self.engine_config.zero_stage or 2,
+        }
+
+        config: dict = {
+            "zero_optimization": zero_opt,
+            "bf16": {"enabled": self.training_config.dtype == DType.BF16},
+            "fp16": {"enabled": self.training_config.dtype == DType.FP16},
+            "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+            "train_micro_batch_size_per_gpu": self.training_config.batch_size,
+            "train_batch_size": self.training_config.batch_size
+            * self.training_config.gradient_accumulation_steps,
+        }
+
+        # Add offload configuration
+        if self.engine_config.offload_optimizer != "none":
+            config["zero_optimization"]["offload_optimizer"] = {
+                "device": "cpu" if self.engine_config.offload_optimizer == "cpu" else "nvme",
+                "pin_memory": True,
+            }
+
+        if self.engine_config.offload_param != "none":
+            config["zero_optimization"]["offload_param"] = {
+                "device": "cpu" if self.engine_config.offload_param == "cpu" else "nvme",
+                "pin_memory": True,
+            }
+
+        return config
+
+    def _get_fsdp_config(self) -> dict:
+        """Get FSDP configuration for Axolotl."""
+        # Map internal sharding-strategy names to torch FSDP enum names.
+        sharding_strategy_map = {
+            "no_shard": "NO_SHARD",
+            "shard_grad_op": "SHARD_GRAD_OP",
+            "full_shard": "FULL_SHARD",
+        }
+
+        config = {
+            "fsdp_sharding_strategy": sharding_strategy_map.get(
+                self.engine_config.sharding_strategy, "FULL_SHARD"
+            ),
+            "fsdp_offload_params": False,
+            # NOTE(review): "fsdp_origin_params" looks like a typo duplicate of
+            # "fsdp_use_orig_params" set below — check the Axolotl FSDP option
+            # names and drop this key if it is not recognized.
+            "fsdp_origin_params": True,
+            "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+            "fsdp_transformer_layer_cls_to_wrap": self._get_transformer_layer_cls(),
+            "fsdp_backward_prefetch": "BACKWARD_PRE",
+            "fsdp_forward_prefetch": False,
+            "fsdp_use_orig_params": True,
+            "fsdp_cpu_ram_efficient_loading": True,
+        }
+
+        return config
+
+    def _get_transformer_layer_cls(self) -> str:
+        """Get transformer layer class name.
+
+        Selects the decoder/encoder block class to auto-wrap for FSDP by
+        substring match on the lowercased model name; defaults to Llama.
+        """
+        model_layer_map = {
+            "llama": "LlamaDecoderLayer",
+            "mistral": "MistralDecoderLayer",
+            "mixtral": "MixtralDecoderLayer",
+            "qwen": "Qwen2DecoderLayer",
+            "gemma": "GemmaDecoderLayer",
+            "bloom": "BloomBlock",
+            "gpt2": "GPT2Block",
+            "gptj": "GPTJBlock",
+            "bert": "BertLayer",
+        }
+
+        model_name_lower = self.model_config.name.lower()
+        for key, value in model_layer_map.items():
+            if key in model_name_lower:
+                return value
+
+        return "LlamaDecoderLayer"  # Default
+
+    def export_yaml(self) -> str:
+        """Generate YAML configuration string.
+
+        Returns:
+            YAML-formatted configuration string
+        """
+        # Local import keeps PyYAML optional until a YAML export is requested.
+        import yaml  # type: ignore[import-untyped]
+
+        config = self.export()
+        return yaml.dump(config, default_flow_style=False, sort_keys=False)  # type: ignore[no-any-return]
diff --git a/src/gpu_mem_calculator/exporters/lightning.py b/src/gpu_mem_calculator/exporters/lightning.py
new file mode 100644
index 0000000000000000000000000000000000000000..116f465afb4663d0b4e4e37d71c8444294506d6a
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/lightning.py
@@ -0,0 +1,219 @@
+"""PyTorch Lightning configuration exporter.
+
+Generates configuration and trainer setup for PyTorch Lightning training.
+"""
+
+from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
+class LightningExporter:
+ """Export configuration to PyTorch Lightning format.
+
+ Lightning uses a Trainer class with various strategies for distributed
+ training including DDP, FSDP, and DeepSpeed.
+ """
+
+ def __init__(
+ self,
+ model_config: ModelConfig,
+ training_config: TrainingConfig,
+ parallelism_config: ParallelismConfig,
+ engine_config: EngineConfig,
+ node_config: NodeConfig | None = None,
+ ) -> None:
+ """Initialize the Lightning exporter.
+
+ Args:
+ model_config: Model architecture configuration
+ training_config: Training hyperparameters
+ parallelism_config: Parallelism settings
+ engine_config: Training engine configuration
+ node_config: Multi-node configuration (optional)
+ """
+ self.model_config = model_config
+ self.training_config = training_config
+ self.parallelism_config = parallelism_config
+ self.engine_config = engine_config
+ self.node_config = node_config
+
+ def export(self) -> dict:
+ """Export configuration to Lightning Trainer format.
+
+ Returns:
+ Dictionary with Trainer configuration
+ """
+ config = {
+ "trainer": {
+ "accelerator": "auto",
+ "devices": self._get_num_devices(),
+ "num_nodes": self._get_num_nodes(),
+ "strategy": self._get_strategy(),
+ "precision": self._get_precision(),
+ "max_epochs": 1, # Placeholder
+ "accumulate_grad_batches": self.training_config.gradient_accumulation_steps,
+ "gradient_clip_val": 1.0,
+ "log_every_n_steps": 50,
+ },
+ "model_config": {
+ "model_name": self.model_config.name,
+ "num_parameters": self.model_config.num_parameters,
+ "hidden_size": self.model_config.hidden_size,
+ "num_layers": self.model_config.num_layers,
+ "num_attention_heads": self.model_config.num_attention_heads,
+ "max_seq_len": self.model_config.max_seq_len,
+ },
+ }
+
+ # Add strategy-specific configuration
+ if self.engine_config.type == EngineType.DEEPSPEED:
+ config["deepspeed_config"] = self._get_deepspeed_config()
+ elif self.engine_config.type == EngineType.FSDP:
+ config["fsdp_config"] = self._get_fsdp_config()
+
+ return config
+
+ def _get_num_devices(self) -> int | str:
+ """Get number of devices."""
+ if self.node_config and self.node_config.gpus_per_node:
+ return self.node_config.gpus_per_node
+ return "auto"
+
+ def _get_num_nodes(self) -> int:
+ """Get number of nodes."""
+ if self.node_config:
+ return self.node_config.num_nodes
+ return 1
+
+ def _get_strategy(self) -> str | dict:
+ """Get Lightning training strategy."""
+ if self.engine_config.type == EngineType.FSDP:
+ return "fsdp"
+ elif self.engine_config.type == EngineType.DEEPSPEED:
+ return "deepspeed"
+ elif self.parallelism_config.data_parallel_size > 1:
+ return "ddp"
+ else:
+ return "auto"
+
+ def _get_precision(self) -> str:
+ """Get precision setting."""
+ dtype_map = {
+ DType.BF16: "bf16-mixed",
+ DType.FP16: "16-mixed",
+ DType.FP32: "32",
+ }
+ return dtype_map.get(self.training_config.dtype, "32")
+
+ def _get_deepspeed_config(self) -> dict:
+ """Get DeepSpeed configuration for Lightning."""
+ zero_opt: dict = {
+ "stage": self.engine_config.zero_stage or 2,
+ }
+
+ config: dict = {
+ "zero_stage": self.engine_config.zero_stage or 2,
+ "zero_optimization": zero_opt,
+ "bf16": {"enabled": self.training_config.dtype == DType.BF16},
+ "fp16": {"enabled": self.training_config.dtype == DType.FP16},
+ "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+ "train_micro_batch_size_per_gpu": self.training_config.batch_size,
+ "train_batch_size": self.training_config.batch_size
+ * self.training_config.gradient_accumulation_steps,
+ }
+
+ # Add offload configuration
+ if self.engine_config.offload_optimizer != "none":
+ config["zero_optimization"]["offload_optimizer"] = {
+ "device": "cpu" if self.engine_config.offload_optimizer == "cpu" else "nvme",
+ }
+
+ if self.engine_config.offload_param != "none":
+ config["zero_optimization"]["offload_param"] = {
+ "device": "cpu" if self.engine_config.offload_param == "cpu" else "nvme",
+ }
+
+ return config
+
+ def _get_fsdp_config(self) -> dict:
+ """Get FSDP configuration for Lightning."""
+ sharding_strategy_map = {
+ "no_shard": "NO_SHARD",
+ "shard_grad_op": "SHARD_GRAD_OP",
+ "full_shard": "FULL_SHARD",
+ }
+
+ config = {
+ "sharding_strategy": sharding_strategy_map.get(
+ self.engine_config.sharding_strategy, "FULL_SHARD"
+ ),
+ "cpu_ram_efficient_loading": True,
+ "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+ "transformer_cls_name": self._get_transformer_cls_name(),
+ "activation_checkpointing": self.training_config.activation_checkpointing > 0,
+ }
+
+ return config
+
+ def _get_transformer_cls_name(self) -> str:
+ """Get transformer class name for FSDP wrapping."""
+ # Map common model names to their layer classes
+ model_layer_map = {
+ "llama": "LlamaDecoderLayer",
+ "mistral": "MistralDecoderLayer",
+ "mixtral": "MixtralDecoderLayer",
+ "qwen": "Qwen2DecoderLayer",
+ "gemma": "GemmaDecoderLayer",
+ "bloom": "BloomBlock",
+ "gpt2": "GPT2Block",
+ "gptj": "GPTJBlock",
+ "bert": "BertLayer",
+ }
+
+ # Try to match based on model name
+ model_name_lower = self.model_config.name.lower()
+ for key, value in model_layer_map.items():
+ if key in model_name_lower:
+ return value
+
+ return "LlamaDecoderLayer" # Default
+
+ def export_code(self) -> str:
+ """Generate Python code for Lightning Trainer setup.
+
+ Returns:
+ String with Python code
+ """
+ config = self.export()
+
+ code = f"""import pytorch_lightning as pl
+from pytorch_lightning.strategies import DeepSpeedStrategy, FSDPStrategy
+
+# Model configuration
+model_config = {config["model_config"]}
+
+# Trainer configuration
+trainer = pl.Trainer(
+ accelerator="{config["trainer"]["accelerator"]}",
+ devices={config["trainer"]["devices"]},
+ num_nodes={config["trainer"]["num_nodes"]},
+ strategy="{config["trainer"]["strategy"]}",
+ precision="{config["trainer"]["precision"]}",
+ max_epochs={config["trainer"]["max_epochs"]},
+ accumulate_grad_batches={config["trainer"]["accumulate_grad_batches"]},
+ gradient_clip_val={config["trainer"]["gradient_clip_val"]},
+ log_every_n_steps={config["trainer"]["log_every_n_steps"]},
+)
+
+# Training loop
+# model = YourModel(model_config)
+# trainer.fit(model)
+"""
+ return code
diff --git a/src/gpu_mem_calculator/exporters/manager.py b/src/gpu_mem_calculator/exporters/manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d5f7fcdacc915d31d7c836d0dc8125fc82f8e8b
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/manager.py
@@ -0,0 +1,223 @@
+"""Export manager for framework configurations.
+
+Provides a unified interface for exporting configurations to various
+training framework formats.
+"""
+
+from enum import Enum
+
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+from gpu_mem_calculator.exporters.accelerate import AccelerateExporter
+from gpu_mem_calculator.exporters.axolotl import AxolotlExporter
+from gpu_mem_calculator.exporters.lightning import LightningExporter
+
+
+class ExportFormat(str, Enum):
+    """Supported export formats.
+
+    Subclasses ``str`` so members compare equal to their plain string
+    values (relied on by ExportManager's match/format checks).
+    """
+
+    ACCELERATE = "accelerate"  # HF Accelerate launcher config
+    LIGHTNING = "lightning"  # PyTorch Lightning Trainer config
+    AXOLOTL = "axolotl"  # Axolotl fine-tuning config
+    DEEPSPEED = "deepspeed"  # Raw DeepSpeed config dict
+    YAML = "yaml"  # Generic YAML dump of all configs
+    JSON = "json"  # Generic JSON dump of all configs
+
+
+class ExportManager:
+    """Unified export manager for all framework configurations.
+
+    This class provides a simple interface to export training
+    configurations to various framework formats.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        training_config: TrainingConfig,
+        parallelism_config: ParallelismConfig,
+        engine_config: EngineConfig,
+        node_config: NodeConfig | None = None,
+    ) -> None:
+        """Initialize the export manager.
+
+        Args:
+            model_config: Model architecture configuration
+            training_config: Training hyperparameters
+            parallelism_config: Parallelism settings
+            engine_config: Training engine configuration
+            node_config: Multi-node configuration (optional)
+        """
+        self.model_config = model_config
+        self.training_config = training_config
+        self.parallelism_config = parallelism_config
+        self.engine_config = engine_config
+        self.node_config = node_config
+
+        # Initialize exporters eagerly; all three share the same configs.
+        self.accelerate_exporter = AccelerateExporter(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            engine_config=engine_config,
+            node_config=node_config,
+        )
+
+        self.lightning_exporter = LightningExporter(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            engine_config=engine_config,
+            node_config=node_config,
+        )
+
+        self.axolotl_exporter = AxolotlExporter(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            engine_config=engine_config,
+            node_config=node_config,
+        )
+
+    def export(self, format: ExportFormat | str) -> dict | str:
+        """Export configuration to specified format.
+
+        Args:
+            format: Export format (accelerate, lightning, axolotl, deepspeed, yaml, json)
+
+        Returns:
+            Dictionary or string with exported configuration
+
+        Raises:
+            ValueError: If the format string is not a known export format.
+        """
+        format_str = format.value if isinstance(format, ExportFormat) else format
+
+        # format_str is a plain str; matching against ExportFormat members
+        # works because ExportFormat subclasses str (value patterns use ==).
+        match format_str:
+            case ExportFormat.ACCELERATE:
+                return self.accelerate_exporter.export()
+            case ExportFormat.LIGHTNING:
+                return self.lightning_exporter.export()
+            case ExportFormat.AXOLOTL:
+                return self.axolotl_exporter.export()
+            case ExportFormat.DEEPSPEED:
+                # DeepSpeed config is embedded in accelerate export
+                config = self.accelerate_exporter.export()
+                return config.get("deepspeed_config", {})  # type: ignore[no-any-return]
+            case ExportFormat.YAML:
+                return self._export_yaml()
+            case ExportFormat.JSON:
+                return self._export_json()
+            case _:
+                raise ValueError(f"Unknown export format: {format}")
+
+    def export_to_file(
+        self,
+        format: ExportFormat | str,
+        filepath: str,
+    ) -> None:
+        """Export configuration to a file.
+
+        Dict results are serialized as YAML or JSON depending on the
+        requested format; string results are written verbatim.
+
+        Args:
+            format: Export format
+            filepath: Path to output file
+        """
+        config = self.export(format)
+
+        if isinstance(config, dict):
+            if format == ExportFormat.YAML or (
+                isinstance(format, str) and format.lower() == "yaml"
+            ):
+                import yaml  # type: ignore[import-untyped]
+
+                with open(filepath, "w") as f:
+                    yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+            else:
+                import json
+
+                with open(filepath, "w") as f:
+                    json.dump(config, f, indent=2)
+        else:
+            with open(filepath, "w") as f:
+                f.write(config)
+
+    def _export_yaml(self) -> str:
+        """Export configuration to generic YAML format.
+
+        Returns:
+            YAML-formatted configuration string
+        """
+        import yaml  # type: ignore[import-untyped]
+
+        # Hand-built (rather than model_dump) so only the stable, documented
+        # fields end up in the generic YAML.
+        config = {
+            "model": {
+                "name": self.model_config.name,
+                "num_parameters": self.model_config.num_parameters,
+                "num_layers": self.model_config.num_layers,
+                "hidden_size": self.model_config.hidden_size,
+                "num_attention_heads": self.model_config.num_attention_heads,
+                "vocab_size": self.model_config.vocab_size,
+                "max_seq_len": self.model_config.max_seq_len,
+                "moe_enabled": self.model_config.moe_enabled,
+            },
+            "training": {
+                "batch_size": self.training_config.batch_size,
+                "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+                "optimizer": self.training_config.optimizer.value,
+                "dtype": self.training_config.dtype.value,
+                "activation_checkpointing": self.training_config.activation_checkpointing,
+            },
+            "parallelism": {
+                "tensor_parallel_size": self.parallelism_config.tensor_parallel_size,
+                "pipeline_parallel_size": self.parallelism_config.pipeline_parallel_size,
+                "data_parallel_size": self.parallelism_config.data_parallel_size,
+                "sequence_parallel": self.parallelism_config.sequence_parallel,
+            },
+            "engine": {
+                "type": self.engine_config.type.value,
+                "zero_stage": self.engine_config.zero_stage,
+                "offload_optimizer": self.engine_config.offload_optimizer.value,
+                "offload_param": self.engine_config.offload_param.value,
+            },
+        }
+
+        # Add node configuration if multi-node
+        if self.node_config and self.node_config.num_nodes > 1:
+            config["multinode"] = {
+                "num_nodes": self.node_config.num_nodes,
+                "gpus_per_node": self.node_config.gpus_per_node,
+                "interconnect_type": self.node_config.interconnect_type.value,
+            }
+
+        return yaml.dump(config, default_flow_style=False, sort_keys=False)  # type: ignore[no-any-return]
+
+    def _export_json(self) -> str:
+        """Export configuration to JSON format.
+
+        Returns:
+            JSON-formatted configuration string
+        """
+        import json
+
+        # Full pydantic dumps here, unlike the curated _export_yaml view.
+        config = {
+            "model": self.model_config.model_dump(),
+            "training": self.training_config.model_dump(),
+            "parallelism": self.parallelism_config.model_dump(),
+            "engine": self.engine_config.model_dump(),
+        }
+
+        # Add node configuration if multi-node
+        # (included for any node_config here, even single-node)
+        if self.node_config:
+            config["multinode"] = self.node_config.model_dump()
+
+        return json.dumps(config, indent=2)
+
+    def get_supported_formats(self) -> list[str]:
+        """Get list of supported export formats.
+
+        Returns:
+            List of format names
+        """
+        return [f.value for f in ExportFormat]
diff --git a/src/gpu_mem_calculator/inference/__init__.py b/src/gpu_mem_calculator/inference/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7f29f09e776240cfef2a71e8a8f2f045556c38
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/__init__.py
@@ -0,0 +1,11 @@
+"""Inference memory calculation module."""
+
+from gpu_mem_calculator.inference.calculator import InferenceMemoryCalculator
+from gpu_mem_calculator.inference.huggingface import HuggingFaceEngine
+from gpu_mem_calculator.inference.sglang import SGLangEngine
+
+__all__ = [
+ "InferenceMemoryCalculator",
+ "HuggingFaceEngine",
+ "SGLangEngine",
+]
diff --git a/src/gpu_mem_calculator/inference/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f549edd147c1016c57bd3e9d566a362fd8aa02e
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/base.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/base.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..510107de82cf36f8cf24fe6272d1a162d64d6091
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/base.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/calculator.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/calculator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b63d6762499335b30f4bdaeba5ca41b0987b0716
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/calculator.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/huggingface.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/huggingface.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3c9d999375640bad52610e5f46959aa4a8cabed
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/huggingface.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/sglang.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/sglang.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73ac1bc60c86c990b249bbfcf9ad7bf7d7ecf449
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/sglang.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/tensorrt_llm.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/tensorrt_llm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd45bbae55c34d36c33fa46fa93badb042c5bc15
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/tensorrt_llm.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/tgi.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/tgi.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3cd32ee65f1deb416d0bfd72a4d988250003891
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/tgi.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/vllm.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/vllm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..348530adf1b79a493b07d30b1256d3ac47b827fa
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/vllm.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/base.py b/src/gpu_mem_calculator/inference/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e185e21d1d8a930902be6e8ea3c9dda97b71a3e3
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/base.py
@@ -0,0 +1,185 @@
+"""Base class for inference engine implementations."""
+
+from abc import ABC, abstractmethod
+
+from gpu_mem_calculator.core.models import (
+ GPUConfig,
+ InferenceConfig,
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+ ModelConfig,
+)
+
+
+class BaseInferenceEngine(ABC):
+    """Abstract base class for inference engine memory calculation.
+
+    Each inference engine (vLLM, TGI, TensorRT-LLM, etc.)
+    should implement this interface to provide engine-specific
+    memory calculations.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        inference_config: InferenceConfig,
+        gpu_config: GPUConfig,
+    ) -> None:
+        """Initialize the inference engine with configuration.
+
+        Args:
+            model_config: Model architecture configuration
+            inference_config: Inference hyperparameters
+            gpu_config: Hardware configuration
+        """
+        self.model_config = model_config
+        self.inference_config = inference_config
+        self.gpu_config = gpu_config
+
+    @abstractmethod
+    def calculate_memory(self) -> InferenceMemoryResult:
+        """Calculate memory requirements for inference.
+
+        This is the main method that should be implemented by each engine.
+
+        Returns:
+            InferenceMemoryResult with complete memory breakdown
+        """
+        pass
+
+    def _check_feasibility(
+        self,
+        total_memory_per_gpu: float,
+    ) -> tuple[bool, float, int | None]:
+        """Check if the configuration fits on available GPU.
+
+        Args:
+            total_memory_per_gpu: Total memory required per GPU
+
+        Returns:
+            Tuple of (fits_on_gpu, utilization_percent, max_batch_size)
+        """
+        # Usable memory is capped by the configured utilization fraction,
+        # but the reported utilization percent is against the full GPU.
+        available_memory = (
+            self.gpu_config.gpu_memory_gb * self.inference_config.gpu_memory_utilization
+        )
+        utilization_percent = (total_memory_per_gpu / self.gpu_config.gpu_memory_gb) * 100
+
+        fits_on_gpu = total_memory_per_gpu <= available_memory
+
+        # Find max batch size that fits
+        max_batch_size = None
+        if fits_on_gpu:
+            # Try to estimate max batch size
+            # This is a simplified heuristic
+            # NOTE(review): despite the name, this is memory per *sample* at
+            # the current batch size, and it includes fixed costs (model
+            # weights), so the estimate is conservative — it reduces to
+            # available/total * current_batch.
+            current_batch = self.inference_config.batch_size
+            overhead_per_token = total_memory_per_gpu / current_batch
+            potential_max_batch = int(available_memory / overhead_per_token)
+            max_batch_size = max(1, potential_max_batch)
+
+        return fits_on_gpu, utilization_percent, max_batch_size
+
+    def _create_result(
+        self,
+        breakdown: InferenceMemoryBreakdown,
+    ) -> InferenceMemoryResult:
+        """Create an InferenceMemoryResult from breakdown.
+
+        Args:
+            breakdown: Memory breakdown by component
+
+        Returns:
+            Complete InferenceMemoryResult
+        """
+        total_memory_per_gpu = breakdown.total_memory_gb
+        # GPUs are counted via tensor parallelism only.
+        num_gpus = self.inference_config.tensor_parallel_size
+        total_memory_all_gpus = total_memory_per_gpu * num_gpus
+
+        fits_on_gpu, utilization_percent, max_batch_size = self._check_feasibility(
+            total_memory_per_gpu
+        )
+
+        # Estimate throughput (simplified heuristic)
+        estimated_throughput = None
+        if fits_on_gpu:
+            # Rough estimate: assumes each token takes ~50ms to process
+            # This varies wildly based on hardware and model
+            # NOTE(review): the formula actually charges 50ms per *batch*
+            # (tokens_per_batch / 0.05), not per token — confirm which the
+            # heuristic intends.
+            tokens_per_batch = self.inference_config.batch_size * self._get_effective_seq_len()
+            estimated_throughput = tokens_per_batch / 0.05  # 50ms per batch
+
+        return InferenceMemoryResult(
+            total_memory_per_gpu_gb=total_memory_per_gpu,
+            total_memory_all_gpus_gb=total_memory_all_gpus,
+            breakdown=breakdown,
+            fits_on_gpu=fits_on_gpu,
+            memory_utilization_percent=utilization_percent,
+            max_supported_batch_size=max_batch_size,
+            estimated_throughput_tokens_per_sec=estimated_throughput,
+        )
+
+    def _get_effective_seq_len(self) -> int:
+        """Get effective sequence length for inference.
+
+        Falls back to the model's max_seq_len when the inference config
+        does not override it.
+        """
+        return self.inference_config.max_seq_len or self.model_config.max_seq_len
+
+    def _get_kv_cache_bytes_per_token(self) -> int:
+        """Calculate KV cache bytes per token.
+
+        Returns:
+            Bytes per token for KV cache (considering quantization)
+        """
+        # Base: 2 * num_layers * num_heads * head_dim * bytes_per_value
+        # For each token, we store K and V for each layer
+        # NOTE(review): this uses num_attention_heads; for GQA/MQA models the
+        # KV cache only stores num_kv_heads, so this overestimates — confirm
+        # whether ModelConfig exposes a KV-head count.
+        num_layers = self.model_config.num_layers
+        num_heads = self.model_config.num_attention_heads
+        head_dim = self.model_config.hidden_size // num_heads
+
+        # Determine bytes per value based on quantization
+        quantization = self.inference_config.kv_cache_quantization
+        bytes_per_value = {
+            "none": 2,  # FP16/BF16
+            "int8": 1,
+            "fp8": 1,
+            "int4": 0.5,
+        }[quantization.value]
+
+        # KV cache = 2 (K and V) * num_layers * num_heads * head_dim * bytes_per_value
+        kv_bytes_per_token = 2 * num_layers * num_heads * head_dim * bytes_per_value
+
+        return int(kv_bytes_per_token)
+
+    def _calculate_model_params_bytes(self) -> int:
+        """Calculate model parameter memory in bytes.
+
+        Returns:
+            Bytes needed for model parameters
+        """
+        dtype_bytes = {
+            "fp32": 4,
+            "fp16": 2,
+            "bf16": 2,
+            "int8": 1,
+            "int4": 0.5,
+        }
+
+        # Assume model is loaded in BF16/FP16 for inference
+        # (dtype is fixed here; weight quantization is not consulted)
+        dtype = "bf16"
+        num_params = self.model_config.num_parameters
+
+        return int(num_params * dtype_bytes[dtype])
+
+    def _calculate_kv_cache_bytes(self, batch_size: int) -> int:
+        """Calculate KV cache memory in bytes.
+
+        Args:
+            batch_size: Batch size to calculate for
+
+        Returns:
+            Bytes needed for KV cache (0 when KV caching is disabled)
+        """
+        if not self.inference_config.use_kv_cache:
+            return 0
+
+        seq_len = self._get_effective_seq_len()
+        kv_bytes_per_token = self._get_kv_cache_bytes_per_token()
+
+        # KV cache = batch_size * seq_len * kv_bytes_per_token
+        return batch_size * seq_len * kv_bytes_per_token
diff --git a/src/gpu_mem_calculator/inference/calculator.py b/src/gpu_mem_calculator/inference/calculator.py
new file mode 100644
index 0000000000000000000000000000000000000000..acd8f03a8f2cb6c96b3a4541321ec908e2e477fa
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/calculator.py
@@ -0,0 +1,104 @@
+"""Main inference memory calculator.
+
+Orchestrates the inference memory calculation by selecting the appropriate
+inference engine and aggregating results.
+"""
+
+from gpu_mem_calculator.core.models import (
+ GPUConfig,
+ InferenceConfig,
+ InferenceEngineType,
+ InferenceMemoryResult,
+ ModelConfig,
+)
+from gpu_mem_calculator.inference.huggingface import HuggingFaceEngine
+from gpu_mem_calculator.inference.sglang import SGLangEngine
+from gpu_mem_calculator.inference.tensorrt_llm import TensorRTLLMEngine
+from gpu_mem_calculator.inference.tgi import TGIEngine
+from gpu_mem_calculator.inference.vllm import VLLMEngine
+
+# Type alias for inference engine types
+InferenceEngineAlias = HuggingFaceEngine | VLLMEngine | TGIEngine | TensorRTLLMEngine | SGLangEngine
+
+
+class InferenceMemoryCalculator:
+    """Main inference memory calculator.
+
+    This class provides a high-level interface for calculating
+    GPU memory requirements for LLM inference with different engines.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        inference_config: InferenceConfig,
+        gpu_config: GPUConfig | None = None,
+    ) -> None:
+        """Initialize the inference calculator.
+
+        Args:
+            model_config: Model architecture configuration
+            inference_config: Inference hyperparameters
+            gpu_config: Hardware configuration (default: 1x 80GB GPU)
+        """
+        self.model_config = model_config
+        self.inference_config = inference_config
+        # Fall back to GPUConfig defaults when no hardware is specified.
+        self.gpu_config = gpu_config or GPUConfig()
+
+    def calculate(self, engine_type: InferenceEngineType) -> InferenceMemoryResult:
+        """Calculate inference GPU memory requirements.
+
+        Selects the appropriate inference engine based on the specified type
+        and returns the memory calculation result.
+
+        Args:
+            engine_type: The inference engine to use
+
+        Returns:
+            InferenceMemoryResult with complete memory breakdown
+        """
+        engine = self._get_engine(engine_type)
+        return engine.calculate_memory()
+
+    def _get_engine(self, engine_type: InferenceEngineType) -> InferenceEngineAlias:
+        """Get the appropriate inference engine instance.
+
+        Args:
+            engine_type: The type of inference engine
+
+        Returns:
+            Engine instance configured with current settings
+
+        Raises:
+            ValueError: If the engine type is not recognized.
+        """
+        # A new engine instance is created per call; engines are stateless
+        # beyond the configs passed in.
+        match engine_type:
+            case InferenceEngineType.HUGGINGFACE:
+                return HuggingFaceEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            case InferenceEngineType.VLLM:
+                return VLLMEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            case InferenceEngineType.TGI:
+                return TGIEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            # Both spellings map to the same TensorRT-LLM engine.
+            case InferenceEngineType.TENSORRT_LLM | InferenceEngineType.TRTLLM:
+                return TensorRTLLMEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            case InferenceEngineType.SGLANG:
+                return SGLangEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            case _:
+                raise ValueError(f"Unknown inference engine type: {engine_type}")
diff --git a/src/gpu_mem_calculator/inference/huggingface.py b/src/gpu_mem_calculator/inference/huggingface.py
new file mode 100644
index 0000000000000000000000000000000000000000..53f36bf1b786b267ccd45e3d9202ea98a4bc61ca
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/huggingface.py
@@ -0,0 +1,95 @@
+"""HuggingFace Transformers inference engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class HuggingFaceEngine(BaseInferenceEngine):
    """HuggingFace Transformers inference engine.

    Models standard HuggingFace inference, including optimizations
    such as Flash Attention and torch.compile.
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Estimate GPU memory for HF Transformers inference.

        The estimate is split into four components: model parameters
        (sharded across tensor-parallel ranks), KV cache, forward-pass
        activations, and framework overhead.

        Returns:
            InferenceMemoryResult with the complete memory breakdown.
        """
        cfg = self.inference_config
        gib = 1024**3

        # Parameters are sharded evenly across tensor-parallel GPUs.
        params_gb = (self._calculate_model_params_bytes() / cfg.tensor_parallel_size) / gib

        # Standard (non-paged) KV cache implementation.
        kv_gb = self._calculate_kv_cache_bytes(cfg.batch_size) / gib

        # Plain PyTorch activations, no kernel fusion.
        act_gb = self._calculate_hf_activations(cfg.batch_size) / gib

        return self._create_result(
            InferenceMemoryBreakdown(
                model_params_gb=params_gb,
                kv_cache_gb=kv_gb,
                activations_gb=act_gb,
                overhead_gb=self._calculate_hf_overhead(),
            )
        )

    def _calculate_hf_activations(self, batch_size: int) -> int:
        """Estimate forward-pass activation memory in bytes.

        Assumes a standard PyTorch forward pass (no kernel fusion)
        with FP16/BF16 activations (2 bytes per value).

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Activation memory in bytes.
        """
        fp16_bytes = 2  # FP16/BF16 activations
        per_layer_values = self._get_effective_seq_len() * self.model_config.hidden_size
        return int(batch_size * per_layer_values * self.model_config.num_layers * fp16_bytes)

    def _calculate_hf_overhead(self) -> float:
        """Estimate HF Transformers runtime overhead in GB.

        Covers the base PyTorch runtime (~150MB) plus model-loading
        overhead (~50MB); even inference-only runs keep some framework
        bookkeeping resident.

        Returns:
            Overhead in GB.
        """
        pytorch_runtime_gb = 0.15  # base PyTorch runtime footprint
        model_loading_gb = 0.05  # buffers retained after model load
        return pytorch_runtime_gb + model_loading_gb
diff --git a/src/gpu_mem_calculator/inference/sglang.py b/src/gpu_mem_calculator/inference/sglang.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee3334ca431e7d0879300c3edc180c8d559f54d4
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/sglang.py
@@ -0,0 +1,200 @@
+"""SGLang inference engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class SGLangEngine(BaseInferenceEngine):
    """SGLang inference engine with RadixAttention memory management.

    SGLang uses RadixAttention to efficiently manage KV cache memory
    with tree-based cache sharing and chunked prefill.
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Estimate GPU memory for SGLang inference.

        Components:
        - Model parameters, sharded across tensor-parallel ranks
        - KV cache managed by RadixAttention (tree-based prefix sharing)
        - Forward-pass activations (chunked prefill)
        - Scheduler / RadixCache / worker overhead

        Returns:
            InferenceMemoryResult with the complete memory breakdown.
        """
        cfg = self.inference_config
        gib = 1024**3

        params_gb = (self._calculate_model_params_bytes() / cfg.tensor_parallel_size) / gib
        kv_gb = self._calculate_sglang_kv_cache(cfg.batch_size) / gib
        act_gb = self._calculate_activations(cfg.batch_size) / gib

        return self._create_result(
            InferenceMemoryBreakdown(
                model_params_gb=params_gb,
                kv_cache_gb=kv_gb,
                activations_gb=act_gb,
                overhead_gb=self._calculate_sglang_overhead(),
            )
        )

    def _calculate_sglang_kv_cache(self, batch_size: int) -> int:
        """Estimate KV cache memory for SGLang with RadixAttention.

        RadixAttention keeps KV entries in a tree so that requests with
        a common prefix share cache blocks; a ~30% saving from prefix
        sharing is assumed unless the radix cache is disabled.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            KV cache memory in bytes (0 when KV caching is disabled).
        """
        cfg = self.inference_config
        if not cfg.use_kv_cache:
            return 0

        # Chunked prefill bounds how many tokens are cached at once.
        chunk = cfg.chunk_size or 8192
        seq_len = self._get_effective_seq_len()
        running = cfg.max_running_requests or batch_size * 4
        per_token = self._get_kv_cache_bytes_per_token()

        # Prefix sharing saves ~30% when the radix cache is enabled.
        sharing = 1.0 if cfg.disable_radix_cache else 0.7
        tokens = batch_size * min(seq_len, chunk)
        kv_bytes = tokens * per_token * sharing

        # Concurrent requests share cache; growth is capped at 2x.
        return int(kv_bytes * min(running / batch_size, 2.0))

    def _calculate_activations(self, batch_size: int) -> int:
        """Estimate activation memory for SGLang in bytes.

        Chunked prefill and optimized attention kernels reduce the
        resident activation footprint relative to plain PyTorch.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Activation memory in bytes.
        """
        cfg = self.inference_config
        # Sequences are processed in chunks during prefill.
        eff_len = min(self._get_effective_seq_len(), cfg.chunk_size or 8192)

        base = (
            batch_size
            * eff_len
            * self.model_config.hidden_size
            * self.model_config.num_layers
            * 2  # bytes per FP16/BF16 value
            * 2  # forward-pass working buffers
        )

        # Chunked prefill: ~40% reduction of the base estimate.
        estimate = int(base * 0.6)
        if cfg.enable_torch_compile:
            # torch.compile: ~20% additional reduction.
            estimate = int(estimate * 0.8)
        return estimate

    def _calculate_sglang_overhead(self) -> float:
        """Estimate SGLang-specific overhead in GB.

        Accumulates the scheduler/RadixCache baseline plus optional
        contributions for the radix tree metadata, P2P buffers, custom
        all-reduce buffers, multi-LoRA adapters, and speculative
        decoding draft models.

        Returns:
            Overhead in GB.
        """
        cfg = self.inference_config
        total = 0.15  # scheduler + RadixCache baseline (~100-200MB)

        if not cfg.disable_radix_cache:
            # ~32 bytes of metadata per tree node, roughly one node per
            # 100 cached tokens.
            nodes = cfg.batch_size * self._get_effective_seq_len() // 100
            total += (nodes * 32) / (1024**3)

        if cfg.enable_p2p:
            total += 0.1  # ~100MB of P2P communication buffers
        if not cfg.disable_custom_all_reduce:
            total += 0.08  # ~80MB of custom all-reduce buffers
        if cfg.multi_lora_enabled:
            total += 0.2  # ~200MB for resident LoRA adapters
        if cfg.speculative_algo != "default":
            total += 0.15  # ~150MB for speculative-decoding draft models

        return total
diff --git a/src/gpu_mem_calculator/inference/tensorrt_llm.py b/src/gpu_mem_calculator/inference/tensorrt_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..38e9e410001d2f05e453b625f62ba6322b3847e4
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/tensorrt_llm.py
@@ -0,0 +1,104 @@
+"""TensorRT-LLM inference engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class TensorRTLLMEngine(BaseInferenceEngine):
    """TensorRT-LLM inference engine with optimized inference kernels.

    TensorRT-LLM provides highly optimized inference through:
    - Weight-only quantization (INT4/INT8)
    - Fused attention kernels
    - In-flight batching
    - Custom CUDA kernels
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Estimate GPU memory for TensorRT-LLM inference.

        Components:
        - Model parameters, sharded across tensor-parallel ranks
        - KV cache
        - Activations, kept small by fused kernels
        - TensorRT runtime and engine-workspace overhead

        Returns:
            InferenceMemoryResult with the complete memory breakdown.
        """
        cfg = self.inference_config
        gib = 1024**3

        params_gb = (self._calculate_model_params_bytes() / cfg.tensor_parallel_size) / gib
        kv_gb = self._calculate_kv_cache_bytes(cfg.batch_size) / gib
        act_gb = self._calculate_tensorrt_activations(cfg.batch_size) / gib

        return self._create_result(
            InferenceMemoryBreakdown(
                model_params_gb=params_gb,
                kv_cache_gb=kv_gb,
                activations_gb=act_gb,
                overhead_gb=self._calculate_tensorrt_overhead(),
            )
        )

    def _calculate_tensorrt_activations(self, batch_size: int) -> int:
        """Estimate activation memory in bytes.

        TensorRT-LLM fuses many operations, so only ~30% of the
        standard FP16/BF16 activation footprint is assumed resident.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Activation memory in bytes.
        """
        standard = (
            batch_size
            * self._get_effective_seq_len()
            * self.model_config.hidden_size
            * self.model_config.num_layers
            * 2  # bytes per FP16/BF16 value
        )
        return int(standard * 0.3)  # fused kernels keep ~30% resident

    def _calculate_tensorrt_overhead(self) -> float:
        """Estimate TensorRT-LLM runtime overhead in GB.

        Covers the TensorRT runtime (~100MB), engine workspace for
        temporary buffers (~200MB), and in-flight batching
        bookkeeping (~50MB).

        Returns:
            Overhead in GB.
        """
        runtime_gb = 0.1  # TensorRT runtime
        workspace_gb = 0.2  # engine workspace (scales with model size)
        batching_gb = 0.05  # in-flight batching structures
        return runtime_gb + workspace_gb + batching_gb
diff --git a/src/gpu_mem_calculator/inference/tgi.py b/src/gpu_mem_calculator/inference/tgi.py
new file mode 100644
index 0000000000000000000000000000000000000000..21305d5b2f45913a4b0b7a7885d7b88d1b068f7f
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/tgi.py
@@ -0,0 +1,109 @@
+"""TGI (Text Generation Inference) engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class TGIEngine(BaseInferenceEngine):
    """Text Generation Inference (TGI) engine by HuggingFace.

    TGI is a production-ready inference server with optimized
    attention mechanisms and memory management.
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Estimate GPU memory for TGI inference.

        Components:
        - Model parameters, sharded across tensor-parallel ranks
        - KV cache
        - Flash-Attention-optimized activations
        - TGI server/router overhead

        Returns:
            InferenceMemoryResult with the complete memory breakdown.
        """
        cfg = self.inference_config
        gib = 1024**3

        params_gb = (self._calculate_model_params_bytes() / cfg.tensor_parallel_size) / gib
        kv_gb = self._calculate_kv_cache_bytes(cfg.batch_size) / gib
        act_gb = self._calculate_tgi_activations(cfg.batch_size) / gib

        return self._create_result(
            InferenceMemoryBreakdown(
                model_params_gb=params_gb,
                kv_cache_gb=kv_gb,
                activations_gb=act_gb,
                overhead_gb=self._calculate_tgi_overhead(),
            )
        )

    def _calculate_tgi_activations(self, batch_size: int) -> int:
        """Estimate activation memory in bytes.

        Flash Attention avoids materializing the full attention matrix,
        so ~40% of the standard FP16/BF16 activation footprint is
        assumed resident.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Activation memory in bytes.
        """
        standard = (
            batch_size
            * self._get_effective_seq_len()
            * self.model_config.hidden_size
            * self.model_config.num_layers
            * 2  # bytes per FP16/BF16 value
        )
        return int(standard * 0.4)  # optimized kernels keep ~40% resident

    def _calculate_tgi_overhead(self) -> float:
        """Estimate TGI-specific overhead in GB.

        Covers the TGI server and router (~200MB), NCCL buffers for
        tensor parallelism (~50MB per rank when TP > 1), dynamic
        batching bookkeeping (~50MB), and preallocated Flash Attention
        buffers (~100MB).

        Returns:
            Overhead in GB.
        """
        tp_size = self.inference_config.tensor_parallel_size
        # NCCL communication buffers scale with the TP degree.
        nccl_gb = tp_size * 0.05 if tp_size > 1 else 0.0
        # server/router + NCCL + dynamic batching + attention buffers
        return 0.2 + nccl_gb + 0.05 + 0.1
diff --git a/src/gpu_mem_calculator/inference/vllm.py b/src/gpu_mem_calculator/inference/vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ad11fb98b6dfd904b97c310c88098be9308f8f
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/vllm.py
@@ -0,0 +1,150 @@
+"""vLLM inference engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class VLLMEngine(BaseInferenceEngine):
    """vLLM inference engine with PagedAttention memory management.

    vLLM uses PagedAttention to efficiently manage KV cache memory
    with block-based allocation.
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Calculate memory requirements for vLLM inference.

        vLLM memory breakdown:
        - Model parameters: Sharded across tensor parallel GPUs
        - KV cache: Managed in blocks with PagedAttention
        - Activations: Temporary during forward pass
        - Overhead: vLLM scheduler, worker overhead, block tables

        Returns:
            InferenceMemoryResult with complete memory breakdown
        """
        batch_size = self.inference_config.batch_size
        tensor_parallel_size = self.inference_config.tensor_parallel_size

        # 1. Model parameters (sharded across tensor parallel GPUs)
        model_params_bytes = self._calculate_model_params_bytes()
        model_params_per_gpu_gb = (model_params_bytes / tensor_parallel_size) / (1024**3)

        # 2. KV cache with PagedAttention (block-based allocation)
        kv_cache_bytes = self._calculate_vllm_kv_cache(batch_size)
        kv_cache_gb = kv_cache_bytes / (1024**3)

        # 3. Activations (temporary, per batch)
        activations_bytes = self._calculate_activations(batch_size)
        activations_gb = activations_bytes / (1024**3)

        # 4. vLLM overhead (scheduler, block manager, etc.)
        overhead_gb = self._calculate_vllm_overhead()

        breakdown = InferenceMemoryBreakdown(
            model_params_gb=model_params_per_gpu_gb,
            kv_cache_gb=kv_cache_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown)

    def _calculate_vllm_kv_cache(self, batch_size: int) -> int:
        """Calculate KV cache memory for vLLM with PagedAttention.

        vLLM uses block-based KV cache management, which is more efficient
        than contiguous allocation. Each block contains multiple token slots.

        Args:
            batch_size: Batch size

        Returns:
            KV cache memory in bytes (0 when KV caching is disabled)
        """
        # Honor the use_kv_cache flag for consistency with the other
        # engines (e.g. SGLangEngine returns 0 when caching is off).
        if not self.inference_config.use_kv_cache:
            return 0

        block_size = self.inference_config.block_size or 16

        # Calculate total tokens needed
        seq_len = self._get_effective_seq_len()
        total_tokens = batch_size * seq_len

        # Number of blocks, rounded up to whole blocks
        num_blocks = (total_tokens + block_size - 1) // block_size

        # Add 20% buffer for dynamic allocation during generation
        num_blocks = int(num_blocks * 1.2)

        # KV cache memory with block allocation
        kv_bytes_per_token = self._get_kv_cache_bytes_per_token()
        total_kv_bytes = num_blocks * block_size * kv_bytes_per_token

        return total_kv_bytes

    def _calculate_activations(self, batch_size: int) -> int:
        """Calculate activation memory for vLLM.

        vLLM optimizes activation memory with kernel fusion
        and efficient attention implementation.

        Args:
            batch_size: Batch size

        Returns:
            Activation memory in bytes
        """
        seq_len = self._get_effective_seq_len()
        hidden_size = self.model_config.hidden_size
        num_layers = self.model_config.num_layers

        # Base activation memory, FP16/BF16 values
        bytes_per_value = 2

        activation_bytes = (
            batch_size
            * seq_len
            * hidden_size
            * num_layers
            * bytes_per_value
            * 2  # Forward pass only (no backward)
        )

        # vLLM optimization: ~50% reduction with kernel fusion
        activation_bytes = int(activation_bytes * 0.5)

        return activation_bytes

    def _calculate_vllm_overhead(self) -> float:
        """Calculate vLLM-specific overhead.

        Includes:
        - Scheduler memory
        - Block table management
        - Worker process overhead
        - CUDA graphs and preallocated buffers

        Returns:
            Overhead in GB
        """
        # Base overhead: ~100-200MB for scheduler and block manager
        base_overhead_gb = 0.15

        # Block table: ~8 bytes (one pointer) per block, including the
        # same 20% dynamic-allocation buffer used for the cache itself
        block_size = self.inference_config.block_size or 16
        seq_len = self._get_effective_seq_len()
        batch_size = self.inference_config.batch_size

        num_blocks = (batch_size * seq_len + block_size - 1) // block_size * 1.2
        block_table_bytes = num_blocks * 8
        block_table_gb = block_table_bytes / (1024**3)

        # Preallocated buffers for CUDA kernels (~50MB)
        buffer_overhead_gb = 0.05

        return base_overhead_gb + block_table_gb + buffer_overhead_gb
diff --git a/src/gpu_mem_calculator/py.typed b/src/gpu_mem_calculator/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/gpu_mem_calculator/utils/__init__.py b/src/gpu_mem_calculator/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..11f9e9765313e6ef76edb438a70b9571b607788b
--- /dev/null
+++ b/src/gpu_mem_calculator/utils/__init__.py
@@ -0,0 +1,5 @@
+"""Utility functions."""
+
+from gpu_mem_calculator.utils.precision import Precision, get_precision_from_dtype
+
+__all__ = ["Precision", "get_precision_from_dtype"]
diff --git a/src/gpu_mem_calculator/utils/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/utils/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1e95827a345cf5edf0fe115ea57c4aaf1e68a5c
Binary files /dev/null and b/src/gpu_mem_calculator/utils/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/utils/__pycache__/precision.cpython-312.pyc b/src/gpu_mem_calculator/utils/__pycache__/precision.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc8bb7fe7036e3ef35a15e239cffcfb5390f4fea
Binary files /dev/null and b/src/gpu_mem_calculator/utils/__pycache__/precision.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/utils/precision.py b/src/gpu_mem_calculator/utils/precision.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a96cbe599bbc5c853756241e67e00207527e56a
--- /dev/null
+++ b/src/gpu_mem_calculator/utils/precision.py
@@ -0,0 +1,83 @@
+"""Precision and data type utilities."""
+
+from dataclasses import dataclass
+
+
@dataclass(frozen=True)
class Precision:
    """Precision information for a data type."""

    # Human-readable name, e.g. "FP16".
    name: str
    # Storage width per parameter, in bits.
    bits_per_param: int
    # Storage width per parameter, in bytes (0.5 for packed INT4).
    bytes_per_param: float
    # True for integer (quantized) formats such as INT8/INT4.
    is_integer: bool = False


# Standard precision definitions, keyed by canonical dtype string.
PRECISION_MAP = {
    "fp32": Precision(name="FP32", bits_per_param=32, bytes_per_param=4.0),
    "fp16": Precision(name="FP16", bits_per_param=16, bytes_per_param=2.0),
    "bf16": Precision(name="BF16", bits_per_param=16, bytes_per_param=2.0),
    "int8": Precision(name="INT8", bits_per_param=8, bytes_per_param=1.0, is_integer=True),
    "int4": Precision(name="INT4", bits_per_param=4, bytes_per_param=0.5, is_integer=True),
}

# Common alternative spellings (torch/HuggingFace-style dtype names) mapped
# onto the canonical entries, so callers may pass either form.
PRECISION_MAP.update(
    {
        "float32": PRECISION_MAP["fp32"],
        "float16": PRECISION_MAP["fp16"],
        "half": PRECISION_MAP["fp16"],
        "bfloat16": PRECISION_MAP["bf16"],
    }
)


def get_precision_from_dtype(dtype: str) -> Precision:
    """Get precision info from dtype string.

    Args:
        dtype: Data type string (e.g. "fp32"/"float32", "fp16"/"float16",
            "bf16"/"bfloat16", "int8", "int4"). Matching is
            case-insensitive.

    Returns:
        Precision object with bytes per parameter information

    Raises:
        ValueError: If dtype is not supported
    """
    try:
        return PRECISION_MAP[dtype.lower()]
    except KeyError:
        raise ValueError(
            f"Unsupported dtype: {dtype}. Supported types: {list(PRECISION_MAP.keys())}"
        ) from None
+
+
def bytes_from_params(num_params: int, dtype: str) -> float:
    """Return the memory footprint, in bytes, of ``num_params`` parameters.

    Args:
        num_params: Number of parameters
        dtype: Data type string understood by ``get_precision_from_dtype``

    Returns:
        Memory in bytes
    """
    return num_params * get_precision_from_dtype(dtype).bytes_per_param
+
+
def gb_from_bytes(num_bytes: float) -> float:
    """Convert a byte count to gigabytes.

    NOTE(review): the divisor is 1024**3, i.e. binary gigabytes (GiB),
    matching the rest of the calculator.

    Args:
        num_bytes: Number of bytes

    Returns:
        Number of gigabytes
    """
    gib = 1024**3
    return num_bytes / gib
+
+
def gb_from_params(num_params: int, dtype: str) -> float:
    """Return the memory footprint, in GB, of ``num_params`` parameters.

    Convenience wrapper composing ``bytes_from_params`` with
    ``gb_from_bytes``.

    Args:
        num_params: Number of parameters
        dtype: Data type string

    Returns:
        Memory in GB
    """
    return gb_from_bytes(bytes_from_params(num_params, dtype))
diff --git a/web/__init__.py b/web/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b26048ff005fcd993b241f5ec241630ea36ff38b
--- /dev/null
+++ b/web/__init__.py
@@ -0,0 +1 @@
+"""Web application for GPU Memory Calculator."""
diff --git a/web/__pycache__/__init__.cpython-312.pyc b/web/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..477381308b81a89e9bbe50f65ccfbdd762d178d2
Binary files /dev/null and b/web/__pycache__/__init__.cpython-312.pyc differ
diff --git a/web/__pycache__/app.cpython-312.pyc b/web/__pycache__/app.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b76f75481ba22d3b1a8f8dd76ff94343f4d997f6
Binary files /dev/null and b/web/__pycache__/app.cpython-312.pyc differ
diff --git a/web/app.py b/web/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..66fc48171a4f13ee69a14fce7bb348b95b68bb88
--- /dev/null
+++ b/web/app.py
@@ -0,0 +1,1161 @@
+"""FastAPI backend for GPU Memory Calculator web application."""
+
+import hashlib
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from pydantic import BaseModel, Field, field_validator, model_validator
+from starlette.requests import Request
+
+from gpu_mem_calculator.config.presets import load_presets
+from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ GPUConfig,
+ InferenceConfig,
+ InferenceEngineType,
+ InterconnectType,
+ MemoryResult,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+from gpu_mem_calculator.core.multinode import MultiNodeCalculator
+from gpu_mem_calculator.exporters.manager import ExportFormat, ExportManager
+from gpu_mem_calculator.inference.calculator import InferenceMemoryCalculator
+
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="GPU Memory Calculator",
    description="Calculate GPU memory requirements for LLM training",
    version="0.1.0",
)

# Configure CORS
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests (the CORS spec forbids the
# wildcard origin with credentials) — confirm whether credentials are needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Setup templates and static files
# Templates and static assets are resolved relative to this module's directory.
BASE_DIR = Path(__file__).parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))

# Mount static files
# Mounted only when the directory exists, so the API can run without assets.
static_dir = BASE_DIR / "static"
if static_dir.exists():
    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
+
+
+# Request/Response models
class CalculateRequest(BaseModel):
    """Request model for memory calculation with comprehensive validation."""

    model: dict[str, Any] = Field(description="Model configuration")
    training: dict[str, Any] = Field(description="Training configuration")
    parallelism: dict[str, Any] | None = Field(
        default=None,
        description="Parallelism configuration",
    )
    engine: dict[str, Any] | None = Field(default=None, description="Engine configuration")
    hardware: dict[str, Any] | None = Field(default=None, description="Hardware configuration")

    @field_validator("model")
    @classmethod
    def validate_moe_settings(cls, v: dict[str, Any]) -> dict[str, Any]:
        """Validate MoE-specific constraints.

        Only applies when ``moe_enabled`` is truthy; the constraints are
        ``top_k <= num_experts``, ``1 <= num_experts <= 256`` and
        ``1 <= top_k <= 8``.

        Raises:
            ValueError: If any MoE constraint is violated.
        """
        if v.get("moe_enabled"):
            num_experts = v.get("num_experts", 1)
            top_k = v.get("top_k", 1)

            if top_k > num_experts:
                raise ValueError(f"MoE top_k ({top_k}) cannot exceed num_experts ({num_experts})")

            if num_experts < 1 or num_experts > 256:
                raise ValueError(f"num_experts must be between 1 and 256, got {num_experts}")

            if top_k < 1 or top_k > 8:
                raise ValueError(f"top_k must be between 1 and 8, got {top_k}")

        return v

    @model_validator(mode="after")
    def validate_parallelism_consistency(self) -> "CalculateRequest":
        """Validate parallelism settings consistency.

        The product tensor_pp * pipeline_pp * data_pp must equal num_gpus,
        and sequence parallelism requires tensor parallelism to be enabled.

        Raises:
            ValueError: On a GPU-count mismatch or an invalid sequence-parallel
                configuration.
        """
        if self.parallelism and self.hardware:
            tensor_pp = self.parallelism.get("tensor_parallel_size", 1)
            pipeline_pp = self.parallelism.get("pipeline_parallel_size", 1)
            data_pp = self.parallelism.get("data_parallel_size", 1)
            num_gpus = self.hardware.get("num_gpus", 1)

            effective_gpus = tensor_pp * pipeline_pp * data_pp

            if effective_gpus != num_gpus:
                # Fix: the multiplication sign was mojibake ("ร") from a bad
                # encoding round-trip; restored to "×".
                raise ValueError(
                    f"Parallelism mismatch: tensor_pp ({tensor_pp}) × "
                    f"pipeline_pp ({pipeline_pp}) × data_pp ({data_pp}) = "
                    f"{effective_gpus} GPUs, but num_gpus is set to {num_gpus}. "
                    f"These must match."
                )

        # Validate sequence parallel requires tensor parallel > 1
        if self.parallelism and self.parallelism.get("sequence_parallel"):
            tensor_pp = self.parallelism.get("tensor_parallel_size", 1)
            if tensor_pp <= 1:
                raise ValueError(
                    f"Sequence parallelism requires tensor_parallel_size > 1, got {tensor_pp}"
                )

        return self

    @model_validator(mode="after")
    def validate_engine_settings(self) -> "CalculateRequest":
        """Validate engine-specific settings.

        ZeRO stages are a DeepSpeed feature: non-zero stages are rejected for
        other engines, and the stage must be within 0-3.

        Raises:
            ValueError: On an invalid engine/ZeRO combination or stage range.
        """
        if not self.engine:
            return self

        engine_type = self.engine.get("type")
        zero_stage = self.engine.get("zero_stage", 0)

        # ZeRO stages only valid for DeepSpeed engines
        if engine_type not in ["deepspeed", "megatron_deepspeed"] and zero_stage > 0:
            raise ValueError(
                f"ZeRO stages are only supported for DeepSpeed engines, "
                f"got engine_type='{engine_type}' with zero_stage={zero_stage}"
            )

        # Validate ZeRO stage range
        if zero_stage < 0 or zero_stage > 3:
            raise ValueError(f"zero_stage must be between 0 and 3, got {zero_stage}")

        return self
+
+
class PresetInfo(BaseModel):
    """Information about a preset model configuration."""

    name: str  # internal preset key (dict key in PRESETS, used in API paths)
    display_name: str  # human-readable name shown to users
    description: str  # short description shown alongside the preset
    config: dict[str, Any]  # raw config payload served by /api/preset/{name}
+
+
# Simple in-memory cache for calculation results
# In production, use Redis or similar
# Maps cache key (md5 of the canonical request JSON, see _cache_key_from_request)
# to a (result, insertion unix-timestamp) pair.
_calculation_cache: dict[str, tuple[MemoryResult, float]] = {}  # key -> (result, timestamp)
_CACHE_TTL = 3600  # 1 hour
_MAX_CACHE_SIZE = 1000  # max entries; beyond this the oldest insertion is evicted
+
+
+def _cache_key_from_request(request: CalculateRequest) -> str:
+ """Generate cache key from request."""
+ request_dict = request.model_dump()
+ # Sort keys for consistent hashing
+ request_str = json.dumps(request_dict, sort_keys=True)
+ return hashlib.md5(request_str.encode()).hexdigest()
+
+
def _get_cached_result(key: str) -> MemoryResult | None:
    """Return the cached result for *key*, or ``None`` if absent or expired.

    Expired entries are deleted eagerly as a side effect so the cache does
    not accumulate stale data.
    """
    entry = _calculation_cache.get(key)
    if entry is None:
        return None

    import time

    result, stored_at = entry
    if time.time() - stored_at < _CACHE_TTL:
        return result
    # Entry outlived the TTL: drop it and report a miss.
    del _calculation_cache[key]
    return None
+
+
def _cache_result(key: str, result: MemoryResult) -> None:
    """Store *result* under *key*, evicting one entry when the cache is full.

    Eviction removes the first key in iteration order — effectively the
    oldest insertion, since dicts preserve insertion order.
    """
    import time

    if len(_calculation_cache) >= _MAX_CACHE_SIZE:
        oldest = next(iter(_calculation_cache))
        _calculation_cache.pop(oldest)

    _calculation_cache[key] = (result, time.time())
+
+
+# Load presets at startup using shared preset loader
+# The shared loader reads from web/presets/models.json
def _load_presets_from_shared() -> dict[str, PresetInfo]:
    """Build the PresetInfo registry from the shared preset loader.

    Missing metadata falls back to sensible defaults: the preset key for
    ``display_name``, and empty strings/dicts for the rest.
    """
    registry: dict[str, PresetInfo] = {}
    for name, raw in load_presets().items():
        registry[name] = PresetInfo(
            name=name,
            display_name=raw.get("display_name", name),
            description=raw.get("description", ""),
            config=raw.get("config", {}),
        )
    return registry


PRESETS = _load_presets_from_shared()
+
+
+# API Routes
+@app.get("/")
+async def index(request: Request) -> Any:
+ """Serve the main web page."""
+ return templates.TemplateResponse("index.html", {"request": request})
+
+
+@app.get("/api/engines")
+async def list_engines() -> dict[str, str]:
+ """List supported training engines."""
+ return {
+ "pytorch_ddp": "PyTorch DDP (Distributed Data Parallel)",
+ "deepspeed": "DeepSpeed ZeRO",
+ "megatron_lm": "Megatron-LM",
+ "fsdp": "PyTorch FSDP (Fully Sharded Data Parallel)",
+ "megatron_deepspeed": "Megatron-LM + DeepSpeed",
+ }
+
+
+@app.get("/api/optimizers")
+async def list_optimizers() -> dict[str, str]:
+ """List supported optimizers."""
+ return {
+ "adam": "Adam",
+ "adamw": "AdamW",
+ "adamw_8bit": "AdamW 8-bit",
+ "sgd": "SGD",
+ }
+
+
+@app.get("/api/dtypes")
+async def list_dtypes() -> dict[str, str]:
+ """List supported data types."""
+ return {
+ "fp32": "FP32 (32-bit floating point)",
+ "fp16": "FP16 (16-bit floating point)",
+ "bf16": "BF16 (16-bit bfloat)",
+ "int8": "INT8 (8-bit integer)",
+ "int4": "INT4 (4-bit integer)",
+ }
+
+
+@app.get("/api/presets")
+async def list_presets() -> dict[str, dict[str, str]]:
+ """List all preset model configurations."""
+ return {
+ name: {
+ "display_name": preset.display_name,
+ "description": preset.description,
+ }
+ for name, preset in PRESETS.items()
+ }
+
+
+@app.get("/api/preset/{preset_name}")
+async def get_preset(preset_name: str) -> dict[str, Any]:
+ """Get a specific preset configuration."""
+ if preset_name not in PRESETS:
+ raise HTTPException(status_code=404, detail=f"Preset '{preset_name}' not found")
+
+ return PRESETS[preset_name].config
+
+
+@app.post("/api/calculate")
+async def calculate_memory(request: CalculateRequest) -> MemoryResult:
+ """Calculate GPU memory requirements.
+
+ Args:
+ request: Calculation request with model, training, and hardware configs
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ # Check cache first
+ cache_key = _cache_key_from_request(request)
+ cached_result = _get_cached_result(cache_key)
+ if cached_result is not None:
+ logger.info(f"Cache hit for key: {cache_key[:8]}...")
+ return cached_result
+
+ try:
+ # Parse model configuration
+ model_data = request.model.copy()
+ # Parse num_parameters if it's a string (e.g., "7B", "7000M")
+ if "num_parameters" in model_data and isinstance(
+ model_data["num_parameters"],
+ str,
+ ):
+ from gpu_mem_calculator.config.parser import ConfigParser
+
+ model_data["num_parameters"] = ConfigParser._parse_num_params(
+ model_data["num_parameters"],
+ )
+
+ model_config = ModelConfig(**model_data)
+
+ # Parse training configuration
+ training_config = TrainingConfig(**request.training)
+
+ # Parse optional configurations with defaults
+ parallelism_config = (
+ ParallelismConfig(**request.parallelism) if request.parallelism else ParallelismConfig()
+ )
+
+ engine_config = EngineConfig(**request.engine) if request.engine else EngineConfig()
+
+ gpu_config = GPUConfig(**request.hardware) if request.hardware else GPUConfig()
+
+ # Create calculator and compute
+ calculator = GPUMemoryCalculator(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ engine_config=engine_config,
+ gpu_config=gpu_config,
+ )
+
+ result = calculator.calculate()
+
+ # Cache the result
+ _cache_result(cache_key, result)
+
+ logger.info(
+ f"Calculation successful: {model_config.name}, "
+ f"{result.total_memory_per_gpu_gb:.2f} GB per GPU"
+ )
+
+ return result
+
+ except ValueError as e:
+ # User input validation error
+ logger.warning(f"Validation error: {str(e)}")
+ raise HTTPException(
+ status_code=400,
+ detail={"error": "Validation error", "message": str(e), "type": "validation_error"},
+ ) from e
+ except Exception as e:
+ # Unexpected system error
+ logger.error(f"Calculation error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "Internal server error",
+ "message": "An unexpected error occurred during calculation",
+ },
+ ) from e
+
+
+@app.post("/api/export/deepspeed")
+async def export_deepspeed_config(request: CalculateRequest) -> dict[str, Any]:
+ """Export DeepSpeed configuration file.
+
+ Args:
+ request: Calculation request with model, training, and hardware configs
+
+ Returns:
+ DeepSpeed config JSON and memory result
+ """
+ try:
+ # First calculate memory
+ calc_result = await calculate_memory(request)
+
+ # Generate DeepSpeed config
+ parallelism = request.parallelism or {}
+ training = request.training
+ engine = request.engine or {}
+
+ train_batch_size = (
+ training.get("batch_size", 1)
+ * training.get("gradient_accumulation_steps", 1)
+ * parallelism.get("data_parallel_size", 1)
+ )
+
+ zero_stage = engine.get("zero_stage", 0)
+ offload_optimizer = engine.get("offload_optimizer", "none")
+ offload_param = engine.get("offload_param", "none")
+
+ deepspeed_config = {
+ "train_batch_size": train_batch_size,
+ "train_micro_batch_size_per_gpu": training.get("batch_size", 1),
+ "gradient_accumulation_steps": training.get("gradient_accumulation_steps", 1),
+ "optimizer": {
+ "type": training.get("optimizer", "AdamW"),
+ "params": {"lr": 0.0001, "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 0.01},
+ },
+ "scheduler": {
+ "type": "WarmupLR",
+ "params": {"warmup_min_lr": 0, "warmup_max_lr": 0.0001, "warmup_num_steps": 2000},
+ },
+ "fp16": {"enabled": training.get("dtype") in ["fp16", "int4", "int8"]},
+ "bf16": {"enabled": training.get("dtype") == "bf16"},
+ "zero_optimization": {"stage": zero_stage},
+ "gradient_clipping": training.get("gradient_clipping", 1.0),
+ "steps_per_print": 100,
+ }
+
+ # Add offload config if ZeRO stage >= 1
+ if zero_stage >= 1:
+ deepspeed_config["zero_optimization"]["offload_optimizer"] = {
+ "device": offload_optimizer
+ }
+ deepspeed_config["zero_optimization"]["offload_param"] = {"device": offload_param}
+
+ return {"config": deepspeed_config, "memory_result": calc_result}
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"DeepSpeed export error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to generate DeepSpeed config: {str(e)}"
+ ) from e
+
+
+@app.post("/api/optimize/batch-size")
+async def optimize_batch_size(request: CalculateRequest) -> dict[str, Any]:
+ """Find maximum batch size that fits in GPU memory.
+
+ Uses binary search to find the maximum batch size that doesn't OOM.
+
+ Args:
+ request: Calculation request with model, training, and hardware configs
+
+ Returns:
+ Maximum batch size that fits and corresponding memory result
+ """
+ try:
+ # Create a mutable copy for testing
+ from copy import deepcopy
+
+ min_batch = 1
+ max_batch = 512 # Reasonable upper bound
+ best_batch = 1
+
+ while min_batch <= max_batch:
+ mid = (min_batch + max_batch) // 2
+
+ # Create modified request with test batch size
+ test_request = deepcopy(request)
+ test_request.training["batch_size"] = mid
+
+ try:
+ # Validate and calculate
+ CalculateRequest.model_validate(test_request)
+ result = await calculate_memory(test_request)
+
+ if result.fits_on_gpu:
+ best_batch = mid
+ min_batch = mid + 1
+ else:
+ max_batch = mid - 1
+ except (ValueError, HTTPException):
+ # Invalid config or doesn't fit
+ max_batch = mid - 1
+
+ # Get final result for best batch size
+ final_request = deepcopy(request)
+ final_request.training["batch_size"] = best_batch
+ final_result = await calculate_memory(final_request)
+
+ return {"max_batch_size": best_batch, "memory_result": final_result}
+
+ except Exception as e:
+ logger.error(f"Batch size optimization error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to optimize batch size: {str(e)}"
+ ) from e
+
+
+@app.post("/api/validate")
+async def validate_config(request: CalculateRequest) -> dict[str, Any]:
+ """Validate a configuration without calculating memory.
+
+ Args:
+ request: Configuration to validate
+
+ Returns:
+ Validation result with valid flag and any errors
+ """
+ try:
+ # Pydantic validation happens automatically when creating CalculateRequest
+ # If we get here, the request is valid
+ return {"valid": True, "errors": []}
+
+ except ValueError as e:
+ # Validation error
+ return {"valid": False, "errors": [str(e)]}
+ except Exception as e:
+ # Unexpected error
+ logger.error(f"Validation error: {str(e)}", exc_info=True)
+ return {"valid": False, "errors": [str(e)]}
+
+
+@app.post("/api/explain-formula")
+async def explain_formula(request: CalculateRequest) -> dict[str, Any]:
+ """Explain the memory formula used for calculation.
+
+ Returns detailed information about which formula is being used,
+ with the user's values plugged in, and links to documentation.
+
+ Args:
+ request: Calculation request with model, training, and hardware configs
+
+ Returns:
+ Formula explanation with formula type, breakdown, and references
+ """
+ try:
+ # Get configuration details
+ engine_type = request.engine.get("type", "pytorch_ddp") if request.engine else "pytorch_ddp"
+ num_params = request.model.get("num_parameters", 0)
+
+ # Parse num_parameters if it's a string (e.g., "7B", "7000M")
+ if isinstance(num_params, str):
+ from gpu_mem_calculator.config.parser import ConfigParser
+
+ num_params = ConfigParser._parse_num_params(num_params)
+
+ optimizer = request.training.get("optimizer", "adamw")
+ num_gpus = request.hardware.get("num_gpus", 1) if request.hardware else 1
+ batch_size = request.training.get("batch_size", 1)
+
+ # Calculate memory to get the breakdown
+ result = await calculate_memory(request)
+
+ # Determine formula description based on engine type
+ formula_info = {
+ "engine_type": engine_type,
+ "engine_name": _get_engine_name(engine_type),
+ "formula_components": [],
+ "total_memory_gb": round(result.total_memory_per_gpu_gb, 2),
+ "breakdown": {
+ "model_params_gb": round(result.breakdown.model_params_gb, 2),
+ "gradients_gb": round(result.breakdown.gradients_gb, 2),
+ "optimizer_states_gb": round(result.breakdown.optimizer_states_gb, 2),
+ "activations_gb": round(result.breakdown.activations_gb, 2),
+ "overhead_gb": round(result.breakdown.overhead_gb, 2),
+ },
+ "references": _get_formula_references(engine_type),
+ }
+
+ # Add engine-specific formula details
+ if engine_type == "pytorch_ddp":
+ formula_info["formula_description"] = (
+ "PyTorch DDP stores complete copies of model parameters, gradients, "
+ "and optimizer states on each GPU."
+ )
+ formula_info["formula_components"] = [
+ {
+ "name": "Model Parameters",
+ "formula": f"{num_params:,} ร 2 bytes (FP16/BF16)",
+ "result": f"{result.breakdown.model_params_gb:.2f} GB",
+ "description": "Full model stored on each GPU",
+ },
+ {
+ "name": "Gradients",
+ "formula": f"{num_params:,} ร 2 bytes (FP16)",
+ "result": f"{result.breakdown.gradients_gb:.2f} GB",
+ "description": "Full gradients during backward pass",
+ },
+ {
+ "name": "Optimizer States",
+ "formula": _get_optimizer_formula(optimizer, num_params)["formula"],
+ "result": f"{result.breakdown.optimizer_states_gb:.2f} GB",
+ "description": _get_optimizer_formula(optimizer, num_params)["description"],
+ },
+ ]
+
+ elif engine_type in ["deepspeed", "megatron_deepspeed"]:
+ zero_stage = request.engine.get("zero_stage", 0) if request.engine else 0
+ offload_optimizer = (
+ request.engine.get("offload_optimizer", "none") if request.engine else "none"
+ )
+ offload_param = (
+ request.engine.get("offload_param", "none") if request.engine else "none"
+ )
+
+ if zero_stage == 0:
+ stage_name = "ZeRO-0 (Baseline)"
+ formula_info["formula_description"] = (
+ f"{stage_name}: No memory optimization. Same as PyTorch DDP."
+ )
+ elif zero_stage == 1:
+ stage_name = "ZeRO-1"
+ formula_info["formula_description"] = (
+ f"{stage_name}: Shards optimizer states across {num_gpus} GPUs. "
+ f"Reduces optimizer memory by {num_gpus}x."
+ )
+ elif zero_stage == 2:
+ stage_name = "ZeRO-2"
+ formula_info["formula_description"] = (
+ f"{stage_name}: Shards optimizer states AND gradients across {num_gpus} GPUs. "
+ f"Reduces memory by {num_gpus}x for both components."
+ )
+ elif zero_stage == 3:
+ stage_name = "ZeRO-3"
+ formula_info["formula_description"] = (
+ f"{stage_name}: Shards parameters, gradients, AND optimizer states. "
+ f"Only largest layer stored intact. Linear memory reduction with GPU count."
+ )
+
+ formula_info["zero_stage"] = zero_stage
+ formula_info["offload_optimizer"] = offload_optimizer
+ formula_info["offload_param"] = offload_param
+
+ # Add ZeRO-specific components
+ if zero_stage == 3:
+ # Estimate largest layer (approx 10% of params for typical models)
+ largest_params = num_params // 10
+ formula_info["formula_components"] = [
+ {
+ "name": "Largest Layer",
+ "formula": f"{largest_params:,} ร 4 bytes (FP16 params + grads)",
+ "result": f"{result.breakdown.model_params_gb:.2f} GB",
+ "description": "Gathered during compute, largest layer kept intact",
+ },
+ {
+ "name": "Sharded Parameters",
+ "formula": f"({num_params:,} ร 2 bytes) / {num_gpus} GPUs",
+ "result": "Included in model params",
+ "description": "Remaining parameters sharded across GPUs",
+ },
+ {
+ "name": "Sharded Optimizer States",
+ "formula": (
+ (
+ f"({_get_optimizer_formula(optimizer, num_params)['formula']}) "
+ f"/ {num_gpus} GPUs"
+ )
+ if offload_optimizer == "none"
+ else f"Offloaded to {offload_optimizer}"
+ ),
+ "result": f"{result.breakdown.optimizer_states_gb:.2f} GB",
+ "description": (
+ _get_optimizer_formula(optimizer, num_params)["description"]
+ + " (sharded or offloaded)"
+ ),
+ },
+ ]
+ else:
+ # ZeRO-1 or ZeRO-2
+ formula_info["formula_components"] = [
+ {
+ "name": "Model Parameters",
+ "formula": f"{num_params:,} ร 2 bytes (FP16)",
+ "result": f"{result.breakdown.model_params_gb:.2f} GB",
+ "description": "Full model on each GPU",
+ },
+ {
+ "name": "Gradients",
+ "formula": (
+ f"{num_params:,} ร 2 bytes"
+ if zero_stage < 2
+ else f"({num_params:,} ร 2 bytes) / {num_gpus} GPUs"
+ ),
+ "result": f"{result.breakdown.gradients_gb:.2f} GB",
+ "description": (
+ "Sharded across GPUs" if zero_stage >= 2 else "Full gradients"
+ ),
+ },
+ {
+ "name": "Optimizer States",
+ "formula": (
+ (
+ f"({_get_optimizer_formula(optimizer, num_params)['formula']}) "
+ f"/ {num_gpus} GPUs"
+ )
+ if offload_optimizer == "none"
+ else f"Offloaded to {offload_optimizer}"
+ ),
+ "result": f"{result.breakdown.optimizer_states_gb:.2f} GB",
+ "description": (
+ _get_optimizer_formula(optimizer, num_params)["description"]
+ + " (sharded or offloaded)"
+ ),
+ },
+ ]
+
+ elif engine_type == "fsdp":
+ sharding_strategy = (
+ request.engine.get("sharding_strategy", "full_shard")
+ if request.engine
+ else "full_shard"
+ )
+
+ if sharding_strategy == "no_shard":
+ strategy_name = "No Sharding (like DDP)"
+ elif sharding_strategy == "shard_grad_op":
+ strategy_name = "Shard Gradients + Optimizer (like ZeRO-2)"
+ else:
+ strategy_name = "Full Shard (like ZeRO-3)"
+
+ formula_info["sharding_strategy"] = sharding_strategy
+ formula_info["strategy_name"] = strategy_name
+ formula_info["formula_description"] = f"FSDP with {strategy_name.lower()} strategy."
+
+ elif engine_type == "megatron_lm":
+ formula_info["formula_description"] = (
+ "Megatron-LM uses tensor and/or pipeline parallelism to "
+ "split the model across GPUs, reducing memory per GPU."
+ )
+
+ # Add parallelism info
+ if request.parallelism:
+ tp_size = request.parallelism.get("tensor_parallel_size", 1)
+ pp_size = request.parallelism.get("pipeline_parallel_size", 1)
+ formula_info["parallelism"] = {
+ "tensor_parallel_size": tp_size,
+ "pipeline_parallel_size": pp_size,
+ }
+
+ # Add activation memory explanation
+ components: list[dict[str, Any]] = formula_info["formula_components"] # type: ignore[assignment]
+ components.append(
+ {
+ "name": "Activations",
+ "formula": (
+ f"batch_size({batch_size}) ร seq_len ร hidden_size ร "
+ f"layers ร ~16 bytes/token/layer"
+ ),
+ "result": f"{result.breakdown.activations_gb:.2f} GB",
+ "description": "Memory from intermediate activations during forward/backward pass",
+ }
+ )
+
+ return formula_info
+
+ except Exception as e:
+ logger.error(f"Formula explanation error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to generate formula explanation: {str(e)}"
+ ) from e
+
+
+def _get_engine_name(engine_type: str) -> str:
+ """Get human-readable engine name."""
+ names = {
+ "pytorch_ddp": "PyTorch DDP (Distributed Data Parallel)",
+ "deepspeed": "DeepSpeed ZeRO",
+ "megatron_lm": "Megatron-LM",
+ "fsdp": "PyTorch FSDP (Fully Sharded Data Parallel)",
+ "megatron_deepspeed": "Megatron-LM + DeepSpeed",
+ }
+ return names.get(engine_type, engine_type)
+
+
+def _get_optimizer_formula(optimizer: str, num_params: int) -> dict[str, str]:
+ """Get optimizer memory formula based on optimizer type.
+
+ Args:
+ optimizer: Optimizer type (adam, adamw, sgd, adamw_8bit)
+ num_params: Number of model parameters
+
+ Returns:
+ Dictionary with 'formula' and 'description' keys
+ """
+ num_params_formatted = f"{num_params:,}"
+
+ if optimizer in ["adam", "adamw"]:
+ return {
+ "formula": f"{num_params_formatted} ร 12 bytes (Adam/AdamW FP32)",
+ "description": "4 bytes FP32 params + 4 bytes momentum + 4 bytes variance",
+ }
+ elif optimizer == "adamw_8bit":
+ return {
+ "formula": f"{num_params_formatted} ร 2 bytes (AdamW 8-bit)",
+ "description": "8-bit quantized optimizer states (2 bytes per parameter)",
+ }
+ elif optimizer == "sgd":
+ return {
+ "formula": f"{num_params_formatted} ร 4 bytes (SGD)",
+ "description": "4 bytes FP32 params (no momentum for SGD)",
+ }
+ else:
+ # Default to AdamW
+ return {
+ "formula": f"{num_params_formatted} ร 12 bytes (Adam/AdamW FP32)",
+ "description": "4 bytes FP32 params + 4 bytes momentum + 4 bytes variance",
+ }
+
+
+def _get_formula_references(engine_type: str) -> list[dict[str, str]]:
+ """Get authoritative references for the formula."""
+ references = [
+ {
+ "title": "EleutherAI Transformer Math 101",
+ "url": "https://blog.eleuther.ai/transformer-math/",
+ "description": "Comprehensive transformer memory breakdown with formulas",
+ },
+ {
+ "title": "Microsoft Research ZeRO Blog",
+ "url": "https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/",
+ "description": "ZeRO optimization techniques and memory formulas",
+ },
+ ]
+
+ if engine_type in ["deepspeed", "megatron_deepspeed"]:
+ references.append(
+ {
+ "title": "DeepSpeed Memory Documentation",
+ "url": "https://deepspeed.readthedocs.io/en/latest/memory.html",
+ "description": "Official DeepSpeed memory requirements and formulas",
+ }
+ )
+ elif engine_type == "megatron_lm" or engine_type == "megatron_deepspeed":
+ references.append(
+ {
+ "title": "NVIDIA Megatron-LM",
+ "url": "https://github.com/NVIDIA/Megatron-LM",
+ "description": "Megatron-LM tensor and pipeline parallelism",
+ }
+ )
+ elif engine_type == "fsdp":
+ references.append(
+ {
+ "title": "PyTorch FSDP Documentation",
+ "url": "https://pytorch.org/docs/stable/fsdp.html",
+ "description": "PyTorch Fully Sharded Data Parallel documentation",
+ }
+ )
+
+ return references
+
+
+@app.post("/api/inference/calculate")
+async def calculate_inference_memory(request: dict[str, Any]) -> dict[str, Any]:
+ """Calculate GPU memory requirements for inference.
+
+ Args:
+ request: Dictionary with model, inference, and hardware configs
+
+ Returns:
+ Inference memory result with breakdown
+ """
+ try:
+ model_data = request.get("model", {})
+ inference_data = request.get("inference", {})
+ hardware_data = request.get("hardware", {})
+
+ # Parse num_parameters if it's a string
+ if "num_parameters" in model_data and isinstance(model_data["num_parameters"], str):
+ from gpu_mem_calculator.config.parser import ConfigParser
+
+ model_data["num_parameters"] = ConfigParser._parse_num_params(
+ model_data["num_parameters"]
+ )
+
+ # Create model config
+ model_config = ModelConfig(**model_data)
+
+ # Create inference config
+ kv_cache_quantization = inference_data.get("kv_cache_quantization", "none")
+ if isinstance(kv_cache_quantization, str):
+ from gpu_mem_calculator.core.models import KVCacheQuantization
+
+ kv_cache_quantization = KVCacheQuantization(kv_cache_quantization)
+
+ inference_config = InferenceConfig(
+ batch_size=inference_data.get("batch_size", 1),
+ kv_cache_quantization=kv_cache_quantization,
+ use_kv_cache=inference_data.get("use_kv_cache", True),
+ tensor_parallel_size=inference_data.get("tensor_parallel_size", 1),
+ gpu_memory_utilization=inference_data.get("gpu_memory_utilization", 0.9),
+ enable_streaming=inference_data.get("enable_streaming", False),
+ # TGI-specific parameters
+ max_total_tokens=inference_data.get("max_total_tokens"),
+ max_input_tokens=inference_data.get("max_input_tokens"),
+ max_batch_total_tokens=inference_data.get("max_batch_total_tokens"),
+ tgi_quantize=inference_data.get("tgi_quantize", "none"),
+ tgi_dtype=inference_data.get("tgi_dtype", "bfloat16"),
+ sharded=inference_data.get("sharded", False),
+ num_shard=inference_data.get("num_shard"),
+ # vLLM-specific parameters
+ block_size=inference_data.get("block_size"),
+ swap_space_gb=inference_data.get("swap_space_gb", 0.0),
+ enable_prefix_caching=inference_data.get("enable_prefix_caching", False),
+ enforce_eager=inference_data.get("enforce_eager", False),
+ max_num_batched_tokens=inference_data.get("max_num_batched_tokens"),
+ max_num_seqs=inference_data.get("max_num_seqs"),
+ vllm_quantization=inference_data.get("vllm_quantization", "none"),
+ # TensorRT-LLM-specific parameters
+ trt_max_batch_size=inference_data.get("trt_max_batch_size"),
+ trt_max_input_len=inference_data.get("trt_max_input_len"),
+ trt_max_seq_len=inference_data.get("trt_max_seq_len"),
+ trt_max_beam_width=inference_data.get("trt_max_beam_width"),
+ # SGLang-specific parameters
+ chunk_size=inference_data.get("chunk_size"),
+ max_running_requests=inference_data.get("max_running_requests"),
+ disable_radix_cache=inference_data.get("disable_radix_cache", False),
+ enable_p2p=inference_data.get("enable_p2p", False),
+ disable_custom_all_reduce=inference_data.get("disable_custom_all_reduce", False),
+ attention_backend=inference_data.get("attention_backend", "flashinfer"),
+ enable_torch_compile=inference_data.get("enable_torch_compile", False),
+ radix_cache_max_seq_len=inference_data.get("radix_cache_max_seq_len"),
+ speculative_algo=inference_data.get("speculative_algo", "default"),
+ multi_lora_enabled=inference_data.get("multi_lora_enabled", False),
+ )
+
+ # Create GPU config
+ gpu_config = GPUConfig(
+ num_gpus=hardware_data.get("num_gpus", 1),
+ gpu_memory_gb=hardware_data.get("gpu_memory_gb", 80),
+ )
+
+ # Get engine type
+ engine_type_str = inference_data.get("engine_type", "huggingface")
+ engine_type_map = {
+ "huggingface": InferenceEngineType.HUGGINGFACE,
+ "vllm": InferenceEngineType.VLLM,
+ "tgi": InferenceEngineType.TGI,
+ "tensorrt_llm": InferenceEngineType.TENSORRT_LLM,
+ "sglang": InferenceEngineType.SGLANG,
+ }
+ engine_type = engine_type_map.get(engine_type_str, InferenceEngineType.HUGGINGFACE)
+
+ # Calculate inference memory
+ calculator = InferenceMemoryCalculator(model_config, inference_config, gpu_config)
+ result = calculator.calculate(engine_type)
+
+ return {
+ "total_memory_per_gpu_gb": result.total_memory_per_gpu_gb,
+ "total_memory_all_gpus_gb": result.total_memory_all_gpus_gb,
+ "breakdown": {
+ "model_params_gb": result.breakdown.model_params_gb,
+ "kv_cache_gb": result.breakdown.kv_cache_gb,
+ "activations_gb": result.breakdown.activations_gb,
+ "overhead_gb": result.breakdown.overhead_gb,
+ },
+ "max_supported_batch_size": result.max_supported_batch_size,
+ "estimated_throughput_tokens_per_sec": result.estimated_throughput_tokens_per_sec,
+ "fits_on_gpu": result.fits_on_gpu,
+ "memory_utilization_percent": result.memory_utilization_percent,
+ }
+
+ except Exception as e:
+ logger.error(f"Inference calculation error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to calculate inference memory: {str(e)}"
+ ) from e
+
+
+@app.post("/api/multinode/calculate")
+async def calculate_multinode(request: dict[str, Any]) -> dict[str, Any]:
+    """Calculate network overhead for multi-node training.
+
+    Builds minimal config objects from the loosely-typed request payload
+    (every key is optional; defaults below are substituted for anything
+    missing), runs MultiNodeCalculator, and returns the overhead breakdown
+    together with human-readable optimization suggestions.
+
+    Args:
+        request: Dictionary with model, training, parallelism, engine, and
+            node_config sub-dictionaries.
+
+    Returns:
+        Network overhead result with suggestions
+
+    Raises:
+        HTTPException: 500 if any part of the calculation fails.
+    """
+    try:
+        model_data = request.get("model", {})
+        training_data = request.get("training", {})
+        parallelism_data = request.get("parallelism", {})
+        engine_data = request.get("engine", {})
+        node_data = request.get("node_config", {})
+
+        # Parse num_parameters if it's a string (e.g. "7B" -> 7_000_000_000).
+        # NOTE(review): relies on a private ConfigParser helper — confirm it
+        # is stable API before depending on it further.
+        if "num_parameters" in model_data and isinstance(model_data["num_parameters"], str):
+            from gpu_mem_calculator.config.parser import ConfigParser
+
+            model_data["num_parameters"] = ConfigParser._parse_num_params(
+                model_data["num_parameters"]
+            )
+
+        # Create minimal configs for multi-node calculation.
+        # Only num_parameters comes from the request; layer/hidden/head
+        # counts are fixed placeholder values (7B-class architecture).
+        model_config = ModelConfig(
+            name="multinode-model",
+            num_parameters=model_data.get("num_parameters", 7_000_000_000),
+            num_layers=32,
+            hidden_size=4096,
+            num_attention_heads=32,
+        )
+
+        training_config = TrainingConfig(
+            dtype=training_data.get("dtype", "bf16"),
+            batch_size=training_data.get("batch_size", 4),
+        )
+
+        parallelism_config = ParallelismConfig(
+            tensor_parallel_size=parallelism_data.get("tensor_parallel_size", 1),
+            pipeline_parallel_size=parallelism_data.get("pipeline_parallel_size", 1),
+            sequence_parallel=parallelism_data.get("sequence_parallel", False),
+        )
+
+        engine_config = EngineConfig(
+            type=engine_data.get("type", "deepspeed"),
+            zero_stage=engine_data.get("zero_stage", 3),
+        )
+
+        # Map the request's interconnect string to the enum; unrecognized
+        # values silently fall back to InfiniBand (no 400 is raised).
+        interconnect_type_str = node_data.get("interconnect_type", "infiniband")
+        interconnect_map = {
+            "infiniband": InterconnectType.INFINIBAND,
+            "nvlink": InterconnectType.NVLINK,
+            "ethernet_200g": InterconnectType.ETHERNET_200G,
+            "ethernet_100g": InterconnectType.ETHERNET_100G,
+            "ethernet_25g": InterconnectType.ETHERNET_25G,
+            "ethernet_10g": InterconnectType.ETHERNET_10G,
+        }
+        interconnect_type = interconnect_map.get(interconnect_type_str, InterconnectType.INFINIBAND)
+
+        node_config = NodeConfig(
+            num_nodes=node_data.get("num_nodes", 2),
+            gpus_per_node=node_data.get("gpus_per_node", 8),
+            interconnect_type=interconnect_type,
+        )
+
+        # Calculate network overhead
+        calculator = MultiNodeCalculator(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            node_config=node_config,
+            engine_config=engine_config,
+        )
+
+        overhead = calculator.calculate_network_overhead()
+
+        # Generate optimization suggestions from simple heuristic thresholds
+        # (>10 GB total traffic, >50 ms/step latency, Ethernet beyond 2 nodes).
+        suggestions: list[str] = []
+        if overhead.total_overhead_gb > 10:
+            suggestions.append("Consider reducing tensor parallelism to lower AllGather overhead")
+        if overhead.estimated_overhead_ms_per_step and overhead.estimated_overhead_ms_per_step > 50:
+            overhead_val = overhead.estimated_overhead_ms_per_step
+            suggestions.append(
+                f"High communication overhead ({overhead_val:.1f}ms/step). "
+                "Consider upgrading interconnect or reducing model size."
+            )
+        if interconnect_type_str.startswith("ethernet") and node_config.num_nodes > 2:
+            suggestions.append(
+                "Ethernet interconnect detected. For multi-node training, "
+                "consider InfiniBand for better performance."
+            )
+
+        return {
+            "network_overhead": {
+                "total_overhead_gb": overhead.total_overhead_gb,
+                "allreduce_gb": overhead.allreduce_gb,
+                "allgather_gb": overhead.allgather_gb,
+                "reducescatter_gb": overhead.reducescatter_gb,
+                # point_to_point traffic is exposed under the "pipeline_gb" key.
+                "pipeline_gb": overhead.point_to_point_gb,
+                "estimated_overhead_ms_per_step": overhead.estimated_overhead_ms_per_step,
+                # These two fields are not computed by this endpoint and are
+                # always None — presumably kept for response-schema
+                # compatibility; confirm against API consumers.
+                "communication_time_ms_per_step": None,
+                "latency_overhead_ms": None,
+            },
+            "suggestions": suggestions,
+        }
+
+    except Exception as e:
+        logger.error(f"Multi-node calculation error: {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=500, detail=f"Failed to calculate multi-node overhead: {str(e)}"
+        ) from e
+
+
+@app.post("/api/export/{format}")
+async def export_framework_config(format: str, request: CalculateRequest) -> dict[str, Any]:
+    """Export configuration to framework-specific format.
+
+    Args:
+        format: Export format (accelerate, lightning, axolotl, deepspeed,
+            yaml, json); matched case-insensitively.
+        request: Calculation request with all configurations
+
+    Returns:
+        Exported configuration file content plus a suggested filename.
+
+    Raises:
+        HTTPException: 400 for an unsupported format, 500 on export failure.
+    """
+    try:
+        # Parse configurations. Copy so the request payload is not mutated
+        # when num_parameters is rewritten below.
+        model_data = request.model.copy()
+        if "num_parameters" in model_data and isinstance(model_data["num_parameters"], str):
+            from gpu_mem_calculator.config.parser import ConfigParser
+
+            # Parse human-readable sizes like "7B" into an integer count.
+            model_data["num_parameters"] = ConfigParser._parse_num_params(
+                model_data["num_parameters"]
+            )
+
+        model_config = ModelConfig(**model_data)
+        training_config = TrainingConfig(**request.training)
+        parallelism_config = (
+            ParallelismConfig(**request.parallelism) if request.parallelism else ParallelismConfig()
+        )
+        engine_config = EngineConfig(**request.engine) if request.engine else EngineConfig()
+
+        # Create minimal node config (not used for single-node export)
+        node_config = NodeConfig(num_nodes=1, gpus_per_node=8)
+
+        # Map format string to ExportFormat enum
+        format_map = {
+            "accelerate": ExportFormat.ACCELERATE,
+            "lightning": ExportFormat.LIGHTNING,
+            "axolotl": ExportFormat.AXOLOTL,
+            "deepspeed": ExportFormat.DEEPSPEED,
+            "yaml": ExportFormat.YAML,
+            "json": ExportFormat.JSON,
+        }
+
+        export_format = format_map.get(format.lower())
+        if not export_format:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Unsupported export format: {format}. Supported: {list(format_map.keys())}",
+            )
+
+        # Export configuration
+        manager = ExportManager(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            engine_config=engine_config,
+            node_config=node_config,
+        )
+
+        result = manager.export(export_format)
+
+        # Generate filename.
+        # NOTE(review): assumes dict results may carry an 'extension' key
+        # (falls back to .txt) — confirm against ExportManager.export.
+        if isinstance(result, dict):
+            filename = f"config_{format}.{result.get('extension', 'txt')}"
+        else:
+            filename = f"config.{format}"
+
+        return {
+            "format": format,
+            "content": result,
+            "filename": filename,
+        }
+
+    except HTTPException:
+        # Re-raise the 400 above unchanged so it is not wrapped as a 500.
+        raise
+    except Exception as e:
+        logger.error(f"Export error ({format}): {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=500, detail=f"Failed to export {format} config: {str(e)}"
+        ) from e
+
+
+def main() -> None:
+ """Run the development server."""
+ import uvicorn
+
+ uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/web/presets/models.json b/web/presets/models.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d56fb187bb3aa73fb5be52f63eceeb88f2995a
--- /dev/null
+++ b/web/presets/models.json
@@ -0,0 +1,407 @@
+{
+ "llama2-7b": {
+ "display_name": "LLaMA 2 7B",
+ "description": "Meta LLaMA 2 7B model",
+ "config": {
+ "model": {
+ "name": "llama2-7b",
+ "num_parameters": "7B",
+ "num_layers": 32,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "vocab_size": 32000,
+ "max_seq_len": 4096
+ },
+ "training": {
+ "batch_size": 4,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 1
+ },
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "llama2-13b": {
+ "display_name": "LLaMA 2 13B",
+ "description": "Meta LLaMA 2 13B model",
+ "config": {
+ "model": {
+ "name": "llama2-13b",
+ "num_parameters": "13B",
+ "num_layers": 40,
+ "hidden_size": 5120,
+ "num_attention_heads": 40,
+ "vocab_size": 32000,
+ "max_seq_len": 4096
+ },
+ "training": {
+ "batch_size": 2,
+ "gradient_accumulation_steps": 8,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 1
+ },
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "llama2-70b": {
+ "display_name": "LLaMA 2 70B",
+ "description": "Meta LLaMA 2 70B model",
+ "config": {
+ "model": {
+ "name": "llama2-70b",
+ "num_parameters": "70B",
+ "num_layers": 80,
+ "hidden_size": 8192,
+ "num_attention_heads": 64,
+ "vocab_size": 32000,
+ "max_seq_len": 4096
+ },
+ "training": {
+ "batch_size": 1,
+ "gradient_accumulation_steps": 16,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 4,
+ "pipeline_parallel_size": 2,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 64,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "gpt3-175b": {
+ "display_name": "GPT-3 175B",
+ "description": "OpenAI GPT-3 175B model",
+ "config": {
+ "model": {
+ "name": "gpt3-175b",
+ "num_parameters": "175B",
+ "num_layers": 96,
+ "hidden_size": 12288,
+ "num_attention_heads": 96,
+ "vocab_size": 50257,
+ "max_seq_len": 2048
+ },
+ "training": {
+ "batch_size": 1,
+ "gradient_accumulation_steps": 1,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 8,
+ "pipeline_parallel_size": 16,
+ "data_parallel_size": 1,
+ "sequence_parallel": true
+ },
+ "engine": {
+ "type": "megatron_lm"
+ },
+ "hardware": {
+ "num_gpus": 1024,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "mixtral-8x7b": {
+ "display_name": "Mixtral 8x7B (MoE)",
+ "description": "Mistral AI Mixtral 8x7B - 46.7B total params, ~12.9B active per token",
+ "config": {
+ "model": {
+ "name": "mixtral-8x7b",
+ "num_parameters": "46.7B",
+ "num_layers": 32,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "vocab_size": 32000,
+ "max_seq_len": 32768,
+ "moe_enabled": true,
+ "num_experts": 8,
+ "top_k": 2,
+ "expert_intermediate_size": 14336
+ },
+ "training": {
+ "batch_size": 2,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 4,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "glm-4-9b": {
+ "display_name": "GLM-4 9B (MoE)",
+ "description": "Tsinghua University GLM-4 9B with MoE architecture",
+ "config": {
+ "model": {
+ "name": "glm-4-9b",
+ "num_parameters": "9B",
+ "num_layers": 40,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "vocab_size": 151552,
+ "max_seq_len": 8192,
+ "moe_enabled": true,
+ "num_experts": 4,
+ "top_k": 2,
+ "expert_intermediate_size": 10240,
+ "shared_expert_intermediate_size": 10240
+ },
+ "training": {
+ "batch_size": 4,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 4,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 2,
+ "offload_optimizer": "none",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 4,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "glm-4.7-355b": {
+ "display_name": "GLM-4.7 355B (MoE)",
+ "description": "Tsinghua University GLM-4.7 - Latest flagship with 355B total / 32B active params",
+ "config": {
+ "model": {
+ "name": "glm-4.7-355b",
+ "num_parameters": "355B",
+ "num_layers": 46,
+ "hidden_size": 4096,
+ "num_attention_heads": 96,
+ "vocab_size": 151552,
+ "max_seq_len": 131072,
+ "moe_enabled": true,
+ "num_experts": 128,
+ "top_k": 8,
+ "expert_intermediate_size": 1408,
+ "shared_expert_intermediate_size": 10944
+ },
+ "training": {
+ "batch_size": 1,
+ "gradient_accumulation_steps": 16,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 4
+ },
+ "parallelism": {
+ "tensor_parallel_size": 8,
+ "pipeline_parallel_size": 4,
+ "data_parallel_size": 16,
+ "sequence_parallel": true
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "cpu"
+ },
+ "hardware": {
+ "num_gpus": 512,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "glm-4.5-air-106b": {
+    "display_name": "GLM-4.5 Air 106B (MoE) ⭐ Air",
+ "description": "Tsinghua University GLM-4.5 Air - 106B total / 12B active params, optimized for deployment",
+ "config": {
+ "model": {
+ "name": "glm-4.5-air-106b",
+ "num_parameters": "106B",
+ "num_layers": 46,
+ "hidden_size": 4096,
+ "num_attention_heads": 96,
+ "vocab_size": 151552,
+ "max_seq_len": 131072,
+ "moe_enabled": true,
+ "num_experts": 128,
+ "top_k": 8,
+ "expert_intermediate_size": 1408,
+ "shared_expert_intermediate_size": 10944
+ },
+ "training": {
+ "batch_size": 2,
+ "gradient_accumulation_steps": 8,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 4,
+ "pipeline_parallel_size": 2,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 64,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "qwen1.5-moe-a2.7b": {
+ "display_name": "Qwen1.5-MoE-A2.7B",
+ "description": "Alibaba Qwen1.5 MoE - 14B total params, 2.7B active per token",
+ "config": {
+ "model": {
+ "name": "qwen1.5-moe-a2.7b",
+ "num_parameters": "14B",
+ "num_layers": 28,
+ "hidden_size": 5120,
+ "num_attention_heads": 40,
+ "vocab_size": 151936,
+ "max_seq_len": 32768,
+ "moe_enabled": true,
+ "num_experts": 8,
+ "top_k": 4,
+ "expert_intermediate_size": 15360
+ },
+ "training": {
+ "batch_size": 2,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 4,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "deepseek-moe-16b": {
+ "display_name": "DeepSeek-MoE 16B",
+ "description": "DeepSeek MoE model with 16.4B total params, ~2.7B active per token",
+ "config": {
+ "model": {
+ "name": "deepseek-moe-16b",
+ "num_parameters": "16.4B",
+ "num_layers": 28,
+ "hidden_size": 2048,
+ "num_attention_heads": 16,
+ "vocab_size": 102400,
+ "max_seq_len": 4096,
+ "moe_enabled": true,
+ "num_experts": 64,
+ "top_k": 6,
+ "expert_intermediate_size": 1408,
+ "shared_expert_intermediate_size": 10944
+ },
+ "training": {
+ "batch_size": 4,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 4,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 2,
+ "offload_optimizer": "none",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ }
+}
diff --git a/web/static/css/styles.css b/web/static/css/styles.css
new file mode 100644
index 0000000000000000000000000000000000000000..40fae68f04ffc72a61c0c38647e6c3e67e5d12d8
--- /dev/null
+++ b/web/static/css/styles.css
@@ -0,0 +1,532 @@
+/* GPU Memory Calculator Styles */
+
+/* Design tokens: shared palette, border, and shadow values used throughout. */
+:root {
+    --primary-color: #2563eb;
+    --primary-hover: #1d4ed8;
+    --success-color: #10b981;
+    --warning-color: #f59e0b;
+    --danger-color: #ef4444;
+    --bg-color: #f8fafc;
+    --card-bg: #ffffff;
+    --border-color: #e2e8f0;
+    --text-primary: #1e293b;
+    --text-secondary: #64748b;
+    --shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1);
+}
+
+/* Minimal reset: predictable box model, no default margins/padding. */
+* {
+    box-sizing: border-box;
+    margin: 0;
+    padding: 0;
+}
+
+body {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+    background-color: var(--bg-color);
+    color: var(--text-primary);
+    line-height: 1.6;
+}
+
+.container {
+    max-width: 1400px;
+    margin: 0 auto;
+    padding: 20px;
+}
+
+header {
+    text-align: center;
+    margin-bottom: 30px;
+}
+
+header h1 {
+    font-size: 2.5rem;
+    color: var(--text-primary);
+    margin-bottom: 5px;
+}
+
+.subtitle {
+    color: var(--text-secondary);
+    font-size: 1.1rem;
+}
+
+/* Tab Navigation */
+.tab-navigation {
+    display: flex;
+    gap: 10px;
+    margin-bottom: 30px;
+    justify-content: center;
+    background: var(--card-bg);
+    padding: 10px;
+    border-radius: 8px;
+    box-shadow: var(--shadow);
+}
+
+.tab-btn {
+    padding: 12px 24px;
+    border: 2px solid var(--border-color);
+    background: var(--card-bg);
+    color: var(--text-secondary);
+    border-radius: 6px;
+    cursor: pointer;
+    font-size: 1rem;
+    font-weight: 500;
+    transition: all 0.2s ease;
+    flex: 1;
+    max-width: 200px;
+}
+
+.tab-btn:hover {
+    background: var(--bg-color);
+    border-color: var(--primary-color);
+    color: var(--primary-color);
+}
+
+.tab-btn.active {
+    background: var(--primary-color);
+    color: white;
+    border-color: var(--primary-color);
+}
+
+/* Panes are hidden by default; JS toggles the .active class. */
+.tab-content {
+    display: none;
+}
+
+/* Active pane: two-column grid — form on the left, 400px results column. */
+.tab-content.active {
+    display: grid;
+    grid-template-columns: 1fr 400px;
+    gap: 20px;
+    align-items: start;
+}
+
+/* display: contents lets the wrapper's children participate in the grid. */
+.main-content {
+    display: contents;
+}
+
+/* Stack to a single column on narrow viewports. */
+@media (max-width: 1024px) {
+    .tab-content.active {
+        grid-template-columns: 1fr;
+    }
+}
+
+/* Config Panel */
+.config-panel {
+    background: var(--card-bg);
+    border-radius: 8px;
+    padding: 20px;
+    box-shadow: var(--shadow);
+}
+
+/* Each settings group is separated by a rule line, except the last. */
+.config-section {
+    margin-bottom: 25px;
+    padding-bottom: 20px;
+    border-bottom: 1px solid var(--border-color);
+}
+
+.config-section:last-of-type {
+    border-bottom: none;
+}
+
+.config-section h3 {
+    font-size: 1.2rem;
+    margin-bottom: 15px;
+    color: var(--text-primary);
+}
+
+/* Responsive field grid: as many 200px-min columns as fit. */
+.form-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+    gap: 15px;
+}
+
+/* position: relative anchors the [data-tooltip] pseudo-element popups. */
+.form-group {
+    display: flex;
+    flex-direction: column;
+    position: relative;
+}
+
+.form-group label {
+    font-size: 0.9rem;
+    font-weight: 500;
+    margin-bottom: 5px;
+    color: var(--text-primary);
+}
+
+.form-group input[type="text"],
+.form-group input[type="number"],
+.form-group select {
+    padding: 8px 12px;
+    border: 1px solid var(--border-color);
+    border-radius: 4px;
+    font-size: 0.95rem;
+    transition: border-color 0.2s;
+}
+
+.form-group input[type="text"]:focus,
+.form-group input[type="number"]:focus,
+.form-group select:focus {
+    outline: none;
+    border-color: var(--primary-color);
+}
+
+.form-group input[type="range"] {
+    margin-top: 5px;
+    width: 100%;
+}
+
+.form-group input[type="checkbox"] {
+    margin-right: 8px;
+}
+
+.info-text {
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    margin-top: 10px;
+}
+
+/* Buttons */
+.button-group {
+    display: flex;
+    gap: 10px;
+    margin-top: 20px;
+}
+
+/* Filled call-to-action button. */
+.btn-primary {
+    background-color: var(--primary-color);
+    color: white;
+    border: none;
+    padding: 12px 24px;
+    border-radius: 6px;
+    font-size: 1rem;
+    font-weight: 600;
+    cursor: pointer;
+    transition: background-color 0.2s;
+}
+
+.btn-primary:hover {
+    background-color: var(--primary-hover);
+}
+
+/* Outlined secondary button. */
+.btn-secondary {
+    background-color: white;
+    color: var(--text-primary);
+    border: 1px solid var(--border-color);
+    padding: 12px 24px;
+    border-radius: 6px;
+    font-size: 1rem;
+    font-weight: 500;
+    cursor: pointer;
+    transition: all 0.2s;
+}
+
+.btn-secondary:hover {
+    background-color: var(--bg-color);
+    border-color: var(--text-secondary);
+}
+
+/* Results Panel */
+/* Sticky so results stay visible while the long form scrolls. */
+.results-panel {
+    background: var(--card-bg);
+    border-radius: 8px;
+    padding: 20px;
+    box-shadow: var(--shadow);
+    position: sticky;
+    top: 20px;
+}
+
+.result-card {
+    margin-bottom: 20px;
+    padding-bottom: 15px;
+    border-bottom: 1px solid var(--border-color);
+}
+
+.result-card:last-child {
+    border-bottom: none;
+}
+
+.result-card h3 {
+    font-size: 1.1rem;
+    margin-bottom: 12px;
+    color: var(--text-primary);
+}
+
+/* Label/value rows for headline metrics. */
+.metric {
+    display: flex;
+    justify-content: space-between;
+    margin-bottom: 8px;
+}
+
+.metric-label {
+    color: var(--text-secondary);
+    font-size: 0.95rem;
+}
+
+.metric-value {
+    font-weight: 600;
+    color: var(--text-primary);
+    font-size: 1rem;
+}
+
+/* Smaller rows for the per-component memory breakdown. */
+.breakdown-item {
+    display: flex;
+    justify-content: space-between;
+    margin-bottom: 6px;
+    font-size: 0.9rem;
+}
+
+.breakdown-label {
+    color: var(--text-secondary);
+}
+
+.breakdown-value {
+    font-weight: 500;
+    color: var(--text-primary);
+}
+
+/* Bar Chart */
+/* Horizontal stacked bar; segment widths are set from JS. */
+.bar-chart {
+    display: flex;
+    height: 24px;
+    border-radius: 4px;
+    overflow: hidden;
+    margin-top: 15px;
+    background-color: var(--border-color);
+}
+
+.bar {
+    height: 100%;
+    transition: width 0.3s ease;
+    position: relative;
+}
+
+/* Add patterns to bars for colorblind accessibility: each segment gets a
+   distinct texture (45° stripes, dots, -45° stripes, 90° stripes) so the
+   chart remains readable without relying on hue alone. */
+#bar-params {
+    background-color: #3b82f6;
+    background-image: repeating-linear-gradient(
+        45deg,
+        transparent,
+        transparent 5px,
+        rgba(255, 255, 255, 0.1) 5px,
+        rgba(255, 255, 255, 0.1) 10px
+    );
+}
+
+#bar-grads {
+    background-color: #8b5cf6;
+    /* Dot pattern. Fix: the rule previously declared background-image twice
+       (a stripe gradient, then this dot gradient); the stripe declaration
+       was dead code since the later declaration always wins, so it has
+       been removed. */
+    background-image: radial-gradient(circle, rgba(255,255,255,0.2) 1px, transparent 1px);
+    background-size: 8px 8px;
+}
+
+#bar-optimizer {
+    background-color: #ec4899;
+    background-image: repeating-linear-gradient(
+        -45deg,
+        transparent,
+        transparent 5px,
+        rgba(255, 255, 255, 0.15) 5px,
+        rgba(255, 255, 255, 0.15) 10px
+    );
+}
+
+#bar-activations {
+    background-color: #10b981;
+    background-image: repeating-linear-gradient(
+        90deg,
+        transparent,
+        transparent 5px,
+        rgba(255, 255, 255, 0.1) 5px,
+        rgba(255, 255, 255, 0.1) 10px
+    );
+}
+
+/* Round only the outer corners of the stacked bar. */
+.bar:first-child {
+    border-top-left-radius: 4px;
+    border-bottom-left-radius: 4px;
+}
+
+.bar:last-child {
+    border-top-right-radius: 4px;
+    border-bottom-right-radius: 4px;
+}
+
+.chart-legend {
+    display: flex;
+    gap: 15px;
+    margin-top: 10px;
+    font-size: 0.8rem;
+}
+
+.legend-item {
+    display: flex;
+    align-items: center;
+    gap: 5px;
+}
+
+.legend-color {
+    width: 12px;
+    height: 12px;
+    border-radius: 2px;
+}
+
+/* Legend swatches mirror the bar segment colors. */
+.legend-color.params { background-color: #3b82f6; }
+.legend-color.grads { background-color: #8b5cf6; }
+.legend-color.optimizer { background-color: #ec4899; }
+.legend-color.activations { background-color: #10b981; }
+
+/* Cleanup: removed a second run of #bar-* rules that re-declared the exact
+   same background-color values already set in the pattern rules above —
+   pure duplication with no effect. */
+
+/* Screen reader only class */
+/* Standard visually-hidden technique: content stays in the accessibility
+   tree but occupies no visible space. */
+.sr-only {
+    position: absolute;
+    width: 1px;
+    height: 1px;
+    padding: 0;
+    margin: -1px;
+    overflow: hidden;
+    clip: rect(0, 0, 0, 0);
+    white-space: nowrap;
+    border-width: 0;
+}
+
+/* Status colors */
+.status-success { color: var(--success-color); }
+.status-warning { color: var(--warning-color); }
+.status-danger { color: var(--danger-color); }
+
+/* Group Label */
+.group-label {
+    font-size: 0.95rem;
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 5px;
+    display: block;
+}
+
+/* Error Message */
+/* Fixed toast in the bottom-right corner. */
+.error-message {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    background-color: var(--danger-color);
+    color: white;
+    padding: 15px 20px;
+    border-radius: 6px;
+    box-shadow: var(--shadow);
+    z-index: 1000;
+}
+
+/* Tooltip */
+/* Pure-CSS tooltip: any element with a data-tooltip attribute shows its
+   value in a bubble above itself on hover. */
+[data-tooltip] {
+    position: relative;
+}
+
+[data-tooltip]:hover::after {
+    content: attr(data-tooltip);
+    position: absolute;
+    bottom: 100%;
+    left: 50%;
+    transform: translateX(-50%);
+    padding: 5px 10px;
+    background-color: var(--text-primary);
+    color: white;
+    font-size: 0.8rem;
+    border-radius: 4px;
+    white-space: nowrap;
+    z-index: 100;
+    margin-bottom: 5px;
+}
+
+/* Formula Explanation Section */
+.formula-description {
+    margin-bottom: 15px;
+    line-height: 1.6;
+}
+
+.formula-description p {
+    margin-bottom: 8px;
+}
+
+.formula-components-list {
+    list-style: none;
+    padding: 0;
+    margin: 15px 0;
+}
+
+/* Each formula component is rendered as a bordered card. */
+.formula-components-list li {
+    background-color: var(--bg-color);
+    border: 1px solid var(--border-color);
+    border-radius: 6px;
+    padding: 12px;
+    margin-bottom: 12px;
+}
+
+.component-name {
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 6px;
+    font-size: 1rem;
+}
+
+/* Monospace block for the formula text itself. */
+.component-formula {
+    font-family: 'Courier New', Courier, monospace;
+    background-color: var(--border-color);
+    padding: 8px;
+    border-radius: 4px;
+    margin: 8px 0;
+    font-size: 0.9rem;
+    overflow-x: auto;
+}
+
+.component-calculation {
+    margin: 6px 0;
+    font-size: 0.9rem;
+    color: var(--text-secondary);
+}
+
+.component-result {
+    margin-top: 6px;
+    font-size: 0.95rem;
+    color: var(--primary-color);
+    font-weight: 500;
+}
+
+.formula-references {
+    margin-top: 20px;
+    padding-top: 15px;
+    border-top: 1px solid var(--border-color);
+}
+
+.formula-references h4 {
+    font-size: 1rem;
+    color: var(--text-primary);
+    margin-bottom: 10px;
+}
+
+.formula-references ul {
+    list-style-type: none;
+    padding: 0;
+}
+
+.formula-references li {
+    margin-bottom: 8px;
+}
+
+.formula-references a {
+    color: var(--primary-color);
+    text-decoration: none;
+    font-size: 0.9rem;
+}
+
+.formula-references a:hover {
+    text-decoration: underline;
+}
diff --git a/web/static/js/app.js b/web/static/js/app.js
new file mode 100644
index 0000000000000000000000000000000000000000..eff9e5eaad9bc71a8271daf94079269897b6fc5b
--- /dev/null
+++ b/web/static/js/app.js
@@ -0,0 +1,1404 @@
+// GPU Memory Calculator - Main Application Logic
+
+class GPUMemCalculator {
+    /**
+     * Set up calculator state and wire all event listeners.
+     * Assumes the DOM is already loaded when instantiated.
+     */
+    constructor() {
+        this.apiBase = '/api';             // prefix for all backend endpoints
+        this.autoCalculateEnabled = true;  // master switch for auto-recalculation
+        this.debounceTimer = null;         // pending setTimeout handle, if any
+        this.debounceDelay = 1000; // ms - increased from 500 to reduce API calls
+        this.isApplyingConfig = false; // Flag to prevent auto-calc during preset loads
+        this.lastCalculationTime = 0; // Prevent too frequent calculations
+        this.minCalculationInterval = 500; // Minimum time between calculations (ms)
+        this.savedConfigs = []; // For comparison mode
+        this.initEventListeners();
+        this.initAutoCalculate();
+        this.initTabListeners();
+        this.loadSavedConfigs();
+    }
+
+    /**
+     * Wire up all static UI controls: tab buttons, preset dropdown,
+     * slider/number sync, dynamic field toggles, and action buttons.
+     * Elements looked up here are assumed to exist in the page.
+     */
+    initEventListeners() {
+        // Tab navigation
+        document.querySelectorAll('.tab-btn').forEach(btn => {
+            btn.addEventListener('click', (e) => {
+                const tabName = e.target.dataset.tab;
+                this.switchTab(tabName);
+            });
+        });
+
+        // Preset selection ("custom" means: leave the form as-is)
+        document.getElementById('preset-select').addEventListener('change', (e) => {
+            if (e.target.value !== 'custom') {
+                this.loadPreset(e.target.value);
+            }
+        });
+
+        // Batch size slider sync: keep number input and slider in lockstep.
+        const batchSizeInput = document.getElementById('batch-size');
+        const batchSizeSlider = document.getElementById('batch-size-slider');
+
+        batchSizeSlider.addEventListener('input', (e) => {
+            batchSizeInput.value = e.target.value;
+        });
+
+        batchSizeInput.addEventListener('input', (e) => {
+            batchSizeSlider.value = e.target.value;
+        });
+
+        // GPU memory dropdown: show the free-form field only for "custom";
+        // otherwise copy the selected capacity into the hidden input.
+        document.getElementById('gpu-model').addEventListener('change', (e) => {
+            const customInput = document.getElementById('gpu-mem-custom');
+            if (e.target.value === 'custom') {
+                customInput.style.display = 'block';
+            } else {
+                customInput.style.display = 'none';
+                customInput.value = e.target.value;
+            }
+        });
+
+        // Engine type change - update dynamic fields
+        document.getElementById('engine-type').addEventListener('change', (e) => {
+            this.updateEngineFields(e.target.value);
+        });
+
+        // Parallelism change - update effective GPUs
+        const parallelismInputs = ['tensor-pp', 'pipeline-pp', 'data-pp'];
+        parallelismInputs.forEach(id => {
+            document.getElementById(id).addEventListener('input', () => {
+                this.updateEffectiveGPUs();
+            });
+        });
+
+        // MoE checkbox - toggle visibility of MoE fields
+        document.getElementById('moe-enabled').addEventListener('change', (e) => {
+            this.toggleMoEFields(e.target.checked);
+        });
+
+        // MoE field changes - update display
+        ['num-experts', 'top-k'].forEach(id => {
+            document.getElementById(id).addEventListener('input', () => {
+                this.updateMoEDisplay();
+            });
+        });
+
+        // Calculate button
+        document.getElementById('calculate-btn').addEventListener('click', () => {
+            this.calculateMemory();
+        });
+
+        // Reset button
+        document.getElementById('reset-btn').addEventListener('click', () => {
+            this.resetForm();
+        });
+
+        // Save config button
+        document.getElementById('save-config-btn').addEventListener('click', () => {
+            this.saveConfig();
+        });
+
+        // Copy JSON button
+        document.getElementById('copy-json-btn').addEventListener('click', () => {
+            this.copyConfigJSON();
+        });
+
+        // Show formula details button - use toggle approach
+        document.getElementById('show-formula-btn').addEventListener('click', () => {
+            this.toggleFormulaExplanation();
+        });
+
+        // Initialize dependent UI to match the default engine selection.
+        this.updateEngineFields('deepspeed');
+        this.updateEffectiveGPUs();
+
+        // Store last config for formula explanation
+        this.lastConfig = null;
+        // Track if formula details are currently visible
+        this.formulaDetailsVisible = false;
+    }
+
+ initAutoCalculate() {
+ // List of all input IDs that should trigger auto-calculation
+ const autoCalcInputs = [
+ // Model settings
+ 'model-name', 'num-params', 'num-layers', 'hidden-size', 'num-heads',
+ 'vocab-size', 'seq-len',
+ // MoE settings
+ 'moe-enabled', 'num-experts', 'top-k', 'expert-intermediate-size', 'shared-expert-size',
+ // Training settings
+ 'batch-size', 'batch-size-slider', 'grad-accum', 'optimizer', 'dtype',
+ 'activation-checkpointing',
+ // Parallelism
+ 'tensor-pp', 'pipeline-pp', 'data-pp', 'seq-parallel',
+ // Engine settings
+ 'engine-type', 'zero-stage', 'offload-optimizer', 'offload-param',
+ 'zero-init', 'sharding-strategy', 'use-distributed-optimizer',
+ 'num-micro-batches', 'gradient-clipping', 'weight-decay', 'lr', 'warmup-steps',
+ // Hardware
+ 'num-gpus', 'gpu-model', 'gpu-mem-custom',
+ ];
+
+ // Add event listeners to all inputs
+ autoCalcInputs.forEach(id => {
+ const element = document.getElementById(id);
+ if (!element) return;
+
+ // Use 'change' event for selects and checkboxes
+ // Use 'input' event for text/number inputs
+ const eventType = (element.tagName === 'SELECT' ||
+ element.tagName === 'INPUT' &&
+ (element.type === 'checkbox' || element.type === 'range'))
+ ? 'input' : 'input';
+
+ element.addEventListener(eventType, () => {
+ this.scheduleAutoCalculate();
+ });
+ });
+ }
+
+    /**
+     * Debounced trigger for calculateMemory(): restarts a timer on every
+     * call and only fires once input has been idle for debounceDelay ms.
+     * Bails out entirely during preset loads, when auto-calc is disabled,
+     * or when called too soon after the last calculation.
+     */
+    scheduleAutoCalculate() {
+        // Don't auto-calculate if currently applying a config (preset load)
+        if (this.isApplyingConfig) return;
+
+        // Don't auto-calculate if disabled
+        if (!this.autoCalculateEnabled) return;
+
+        // Check minimum time between calculations
+        // NOTE(review): returning here (instead of still scheduling) means
+        // an edit made within minCalculationInterval of the last calculation
+        // may never trigger a recalculation — confirm this throttle is
+        // intentional rather than deferring via the debounce timer.
+        const now = Date.now();
+        if (now - this.lastCalculationTime < this.minCalculationInterval) {
+            return; // Skip this calculation, too soon
+        }
+
+        // Clear existing timer so rapid edits collapse into one calculation.
+        if (this.debounceTimer) {
+            clearTimeout(this.debounceTimer);
+        }
+
+        // Schedule new calculation
+        this.debounceTimer = setTimeout(() => {
+            this.calculateMemory();
+        }, this.debounceDelay);
+    }
+
+ /**
+ * Client-side validation before making API call
+ * Returns {valid: boolean, errors: string[]}
+ */
+ validateForm() {
+ const errors = [];
+
+ // Get form values
+ const tensorPP = parseInt(document.getElementById('tensor-pp').value) || 1;
+ const pipelinePP = parseInt(document.getElementById('pipeline-pp').value) || 1;
+ const dataPP = parseInt(document.getElementById('data-pp').value) || 1;
+ const numGPUs = parseInt(document.getElementById('num-gpus').value) || 1;
+ const seqParallel = document.getElementById('seq-parallel').checked;
+ const engineType = document.getElementById('engine-type').value;
+ const zeroStage = parseInt(document.getElementById('zero-stage').value) || 0;
+ const moeEnabled = document.getElementById('moe-enabled').checked;
+ const numExperts = parseInt(document.getElementById('num-experts').value) || 1;
+ const topK = parseInt(document.getElementById('top-k').value) || 1;
+
+ // Validate parallelism consistency
+ const effectiveGPUs = tensorPP * pipelinePP * dataPP;
+ if (effectiveGPUs !== numGPUs) {
+ errors.push(
+ `Parallelism mismatch: ${tensorPP}ร${pipelinePP}ร${dataPP}=${effectiveGPUs} GPUs, ` +
+ `but num_gpus=${numGPUs}. These must match.`
+ );
+ }
+
+ // Validate sequence parallel requires tensor parallel > 1
+ if (seqParallel && tensorPP <= 1) {
+ errors.push(
+ 'Sequence parallelism requires tensor_parallel_size > 1, ' +
+ `but tensor_pp=${tensorPP}.`
+ );
+ }
+
+ // Validate ZeRO stages only for DeepSpeed engines
+ if (zeroStage > 0 && !['deepspeed', 'megatron_deepspeed'].includes(engineType)) {
+ errors.push(
+ `ZeRO stages are only supported for DeepSpeed engines, ` +
+ `but engine_type='${engineType}' with zero_stage=${zeroStage}.`
+ );
+ }
+
+ // Validate MoE settings
+ if (moeEnabled) {
+ if (topK > numExperts) {
+ errors.push(
+ `MoE top_k (${topK}) cannot exceed num_experts (${numExperts}).`
+ );
+ }
+ if (numExperts < 1 || numExperts > 256) {
+ errors.push(`num_experts must be between 1 and 256, got ${numExperts}.`);
+ }
+ if (topK < 1 || topK > 8) {
+ errors.push(`top_k must be between 1 and 8, got ${topK}.`);
+ }
+ }
+
+ return {
+ valid: errors.length === 0,
+ errors: errors
+ };
+ }
+
+ /**
+ * Switch between tabs
+ */
+ switchTab(tabName) {
+ // Update tab buttons
+ document.querySelectorAll('.tab-btn').forEach(btn => {
+ btn.classList.remove('active');
+ if (btn.dataset.tab === tabName) {
+ btn.classList.add('active');
+ }
+ });
+
+ // Update tab content
+ document.querySelectorAll('.tab-content').forEach(content => {
+ content.classList.remove('active');
+ content.style.display = 'none';
+ });
+
+ const activeTab = document.getElementById(`${tabName}-tab`);
+ if (activeTab) {
+ activeTab.classList.add('active');
+ activeTab.style.display = 'block';
+ }
+ }
+
  /**
   * Wire up event listeners for the inference and multi-node tabs.
   *
   * Every lookup is null-guarded so the method is safe to call even when a
   * tab's markup is absent from the page. Side effects only; returns nothing.
   */
  initTabListeners() {
    // --- Inference tab: calculate / reset buttons and preset dropdown ---
    const infCalcBtn = document.getElementById('inference-calculate-btn');
    const infResetBtn = document.getElementById('inference-reset-btn');
    const infPresetSelect = document.getElementById('inference-preset-select');
    if (infCalcBtn) {
      infCalcBtn.addEventListener('click', () => this.calculateInferenceMemory());
    }
    if (infResetBtn) {
      infResetBtn.addEventListener('click', () => this.resetInferenceForm());
    }
    if (infPresetSelect) {
      infPresetSelect.addEventListener('change', (e) => {
        // 'custom' means "no preset" — leave the form untouched.
        if (e.target.value !== 'custom') {
          this.loadInferencePreset(e.target.value);
        }
      });
    }

    // GPU memory utilization slider: mirror the slider value into its label.
    const gpuMemUtilSlider = document.getElementById('gpu-memory-util');
    const gpuMemUtilValue = document.getElementById('gpu-memory-util-value');
    if (gpuMemUtilSlider && gpuMemUtilValue) {
      gpuMemUtilSlider.addEventListener('input', (e) => {
        gpuMemUtilValue.textContent = parseFloat(e.target.value).toFixed(2);
      });
    }

    // Inference engine dropdown — show/hide engine-specific sections.
    const infEngineSelect = document.getElementById('inference-engine');
    if (infEngineSelect) {
      infEngineSelect.addEventListener('change', (e) => {
        this.updateInferenceEngineFields(e.target.value);
      });
      // Initialize visibility for the default engine selection.
      this.updateInferenceEngineFields(infEngineSelect.value);
    }

    // --- Multi-node tab: calculate / reset buttons and preset dropdown ---
    const multiCalcBtn = document.getElementById('multinode-calculate-btn');
    const multiResetBtn = document.getElementById('multinode-reset-btn');
    const multiPresetSelect = document.getElementById('multinode-preset-select');
    if (multiCalcBtn) {
      multiCalcBtn.addEventListener('click', () => this.calculateMultiNode());
    }
    if (multiResetBtn) {
      multiResetBtn.addEventListener('click', () => this.resetMultiNodeForm());
    }
    if (multiPresetSelect) {
      multiPresetSelect.addEventListener('change', (e) => {
        if (e.target.value !== 'custom') {
          this.loadMultiNodePreset(e.target.value);
        }
      });
    }

    // Keep the "total GPUs" readout in sync with nodes × GPUs-per-node.
    const numNodesInput = document.getElementById('num-nodes');
    const gpusPerNodeInput = document.getElementById('gpus-per-node');
    const totalGpusSpan = document.getElementById('multinode-total-gpus');

    const updateTotalGpus = () => {
      if (numNodesInput && gpusPerNodeInput && totalGpusSpan) {
        const nodes = parseInt(numNodesInput.value) || 1;
        const gpusPerNode = parseInt(gpusPerNodeInput.value) || 8;
        totalGpusSpan.textContent = nodes * gpusPerNode;
      }
    };

    if (numNodesInput) numNodesInput.addEventListener('input', updateTotalGpus);
    if (gpusPerNodeInput) gpusPerNodeInput.addEventListener('input', updateTotalGpus);

    // Export framework button opens the export modal.
    const exportBtn = document.getElementById('export-framework-btn');
    if (exportBtn) {
      exportBtn.addEventListener('click', () => this.showExportModal());
    }
  }
+
+ /**
+ * Load saved configs from localStorage
+ */
+ loadSavedConfigs() {
+ try {
+ const saved = localStorage.getItem('gpu-mem-saved-configs');
+ if (saved) {
+ this.savedConfigs = JSON.parse(saved);
+ }
+ } catch (e) {
+ console.warn('Failed to load saved configs:', e);
+ this.savedConfigs = [];
+ }
+ }
+
+ /**
+ * Save current config for comparison
+ */
+ saveConfigForComparison() {
+ const config = this.collectFormData();
+ const name = config.model.name || 'unnamed';
+
+ // Add timestamp
+ config.savedAt = new Date().toISOString();
+ config.id = Date.now();
+
+ this.savedConfigs.push(config);
+
+ // Limit to 10 saved configs
+ if (this.savedConfigs.length > 10) {
+ this.savedConfigs.shift();
+ }
+
+ // Save to localStorage
+ try {
+ localStorage.setItem('gpu-mem-saved-configs', JSON.stringify(this.savedConfigs));
+ this.showError(`Saved config: ${name}`, true);
+ } catch (e) {
+ this.showError('Failed to save config');
+ }
+ }
+
+ /**
+ * Show comparison modal/panel
+ */
+ showComparison(configId) {
+ const config = this.savedConfigs.find(c => c.id === configId);
+ if (!config) return;
+
+ const currentConfig = this.collectFormData();
+
+ // Create comparison HTML
+ const comparisonHTML = this.generateComparisonHTML(currentConfig, config);
+
+ // Show in modal (you'll need to add modal HTML to index.html)
+ alert('Comparison feature - modal will be added');
+ }
+
+ /**
+ * Generate HTML for comparison view
+ */
+ generateComparisonHTML(config1, config2) {
+ // Calculate memory for both configs
+ // For now, just return placeholder
+ return `
+ Configuration Comparison
+
+
+
Current Config
+
${JSON.stringify(config1, null, 2)}
+
+
+
Saved Config
+
${JSON.stringify(config2, null, 2)}
+
+
+ `;
+ }
+
  /**
   * Enable or disable automatic recalculation on form changes.
   * @param {boolean} enabled - truthy to auto-run calculations on input.
   */
  setAutoCalculate(enabled) {
    this.autoCalculateEnabled = enabled;
  }
+
+ async loadPreset(presetName) {
+ try {
+ const response = await fetch(`${this.apiBase}/preset/${presetName}`);
+ if (!response.ok) {
+ throw new Error(`Failed to load preset: ${presetName}`);
+ }
+
+ const config = await response.json();
+ this.applyConfig(config);
+ } catch (error) {
+ this.showError(`Failed to load preset: ${error.message}`);
+ }
+ }
+
+ async loadInferencePreset(presetName) {
+ try {
+ const response = await fetch(`${this.apiBase}/preset/${presetName}`);
+ if (!response.ok) {
+ throw new Error(`Failed to load preset: ${presetName}`);
+ }
+
+ const config = await response.json();
+ this.applyInferenceConfig(config);
+ } catch (error) {
+ this.showError(`Failed to load preset: ${error.message}`);
+ }
+ }
+
+ async loadMultiNodePreset(presetName) {
+ try {
+ const response = await fetch(`${this.apiBase}/preset/${presetName}`);
+ if (!response.ok) {
+ throw new Error(`Failed to load preset: ${presetName}`);
+ }
+
+ const config = await response.json();
+ this.applyMultiNodeConfig(config);
+ } catch (error) {
+ this.showError(`Failed to load preset: ${error.message}`);
+ }
+ }
+
+ applyConfig(config) {
+ // Set flag to prevent auto-calculation during config load
+ this.isApplyingConfig = true;
+
+ // Apply model configuration
+ if (config.model) {
+ if (config.model.name) document.getElementById('model-name').value = config.model.name;
+ if (config.model.num_parameters) document.getElementById('num-params').value = config.model.num_parameters;
+ if (config.model.num_layers) document.getElementById('num-layers').value = config.model.num_layers;
+ if (config.model.hidden_size) document.getElementById('hidden-size').value = config.model.hidden_size;
+ if (config.model.num_attention_heads) document.getElementById('num-heads').value = config.model.num_attention_heads;
+ if (config.model.vocab_size) document.getElementById('vocab-size').value = config.model.vocab_size;
+ if (config.model.max_seq_len) document.getElementById('seq-len').value = config.model.max_seq_len;
+ }
+
+ // Apply MoE configuration
+ if (config.model.moe_enabled !== undefined) {
+ document.getElementById('moe-enabled').checked = config.model.moe_enabled;
+ this.toggleMoEFields(config.model.moe_enabled);
+
+ if (config.model.moe_enabled) {
+ if (config.model.num_experts) {
+ document.getElementById('num-experts').value = config.model.num_experts;
+ }
+ if (config.model.top_k) {
+ document.getElementById('top-k').value = config.model.top_k;
+ }
+ if (config.model.expert_intermediate_size) {
+ document.getElementById('expert-intermediate-size').value = config.model.expert_intermediate_size;
+ }
+ if (config.model.shared_expert_intermediate_size) {
+ document.getElementById('shared-expert-size').value = config.model.shared_expert_intermediate_size;
+ }
+ this.updateMoEDisplay();
+ }
+ }
+
+ // Apply training configuration
+ if (config.training) {
+ if (config.training.batch_size) {
+ document.getElementById('batch-size').value = config.training.batch_size;
+ document.getElementById('batch-size-slider').value = config.training.batch_size;
+ }
+ if (config.training.gradient_accumulation_steps) {
+ document.getElementById('grad-accum').value = config.training.gradient_accumulation_steps;
+ }
+ if (config.training.optimizer) document.getElementById('optimizer').value = config.training.optimizer;
+ if (config.training.dtype) document.getElementById('dtype').value = config.training.dtype;
+ if (config.training.activation_checkpointing !== undefined) {
+ document.getElementById('activation-checkpointing').value = config.training.activation_checkpointing;
+ }
+ }
+
+ // Apply parallelism configuration
+ if (config.parallelism) {
+ if (config.parallelism.tensor_parallel_size) {
+ document.getElementById('tensor-pp').value = config.parallelism.tensor_parallel_size;
+ }
+ if (config.parallelism.pipeline_parallel_size) {
+ document.getElementById('pipeline-pp').value = config.parallelism.pipeline_parallel_size;
+ }
+ if (config.parallelism.data_parallel_size) {
+ document.getElementById('data-pp').value = config.parallelism.data_parallel_size;
+ }
+ if (config.parallelism.sequence_parallel) {
+ document.getElementById('seq-parallel').checked = config.parallelism.sequence_parallel;
+ }
+ }
+
+ // Apply engine configuration
+ if (config.engine) {
+ if (config.engine.type) {
+ document.getElementById('engine-type').value = config.engine.type;
+ this.updateEngineFields(config.engine.type);
+ }
+ if (config.engine.zero_stage !== undefined) {
+ document.getElementById('zero-stage').value = config.engine.zero_stage;
+ }
+ if (config.engine.offload_optimizer) {
+ document.getElementById('offload-optimizer').value = config.engine.offload_optimizer;
+ }
+ if (config.engine.offload_param) {
+ document.getElementById('offload-param').value = config.engine.offload_param;
+ }
+ }
+
+ // Apply hardware configuration
+ if (config.hardware) {
+ if (config.hardware.num_gpus) document.getElementById('num-gpus').value = config.hardware.num_gpus;
+ if (config.hardware.gpu_memory_gb) {
+ document.getElementById('gpu-model').value = config.hardware.gpu_memory_gb;
+ document.getElementById('gpu-mem-custom').value = config.hardware.gpu_memory_gb;
+ }
+ }
+
+ this.updateEffectiveGPUs();
+
+ // Re-enable auto-calculation and trigger calculation
+ setTimeout(() => {
+ this.isApplyingConfig = false;
+ this.calculateMemory();
+ }, 100);
+ }
+
+ updateEngineFields(engineType) {
+ const zeroStageGroup = document.getElementById('zero-stage-group');
+ const offloadOptGroup = document.getElementById('offload-opt-group');
+ const offloadParamGroup = document.getElementById('offload-param-group');
+ const zeroInitGroup = document.getElementById('zero-init-group');
+ const shardingStrategyGroup = document.getElementById('sharding-strategy-group');
+ const megatronOptions = document.getElementById('megatron-options');
+
+ // Hide all first
+ zeroStageGroup.style.display = 'none';
+ offloadOptGroup.style.display = 'none';
+ offloadParamGroup.style.display = 'none';
+ zeroInitGroup.style.display = 'none';
+ shardingStrategyGroup.style.display = 'none';
+ megatronOptions.style.display = 'none';
+
+ // Show/hide fields based on engine type
+ switch (engineType) {
+ case 'deepspeed':
+ case 'megatron_deepspeed':
+ zeroStageGroup.style.display = 'block';
+ offloadOptGroup.style.display = 'block';
+ offloadParamGroup.style.display = 'block';
+ zeroInitGroup.style.display = 'block';
+ break;
+ case 'pytorch_ddp':
+ case 'megatron_lm':
+ // No special options
+ break;
+ case 'fsdp':
+ shardingStrategyGroup.style.display = 'block';
+ break;
+ }
+
+ // Show Megatron options for Megatron engines
+ if (engineType === 'megatron_lm' || engineType === 'megatron_deepspeed') {
+ megatronOptions.style.display = 'block';
+ }
+ }
+
+ updateEffectiveGPUs() {
+ const tensorPP = parseInt(document.getElementById('tensor-pp').value) || 1;
+ const pipelinePP = parseInt(document.getElementById('pipeline-pp').value) || 1;
+ const dataPP = parseInt(document.getElementById('data-pp').value) || 1;
+
+ const effectiveGPUs = tensorPP * pipelinePP * dataPP;
+ document.getElementById('effective-gpus').textContent = effectiveGPUs;
+ }
+
+ toggleMoEFields(enabled) {
+ const moeFields = document.getElementById('moe-fields');
+ moeFields.style.display = enabled ? 'block' : 'none';
+ if (enabled) {
+ this.updateMoEDisplay();
+ }
+ }
+
+ updateMoEDisplay() {
+ const numExperts = parseInt(document.getElementById('num-experts').value) || 8;
+ const topK = parseInt(document.getElementById('top-k').value) || 2;
+
+ document.getElementById('total-experts-display').textContent = numExperts;
+ document.getElementById('active-experts-display').textContent = topK;
+ }
+
+ updateInferenceEngineFields(engineType) {
+ const tgiSettings = document.getElementById('tgi-settings');
+ const vllmSettings = document.getElementById('vllm-settings');
+ const tensorrtSettings = document.getElementById('tensorrt-settings');
+ const sglangSettings = document.getElementById('sglang-settings');
+
+ // Hide all engine-specific sections first
+ if (tgiSettings) tgiSettings.style.display = 'none';
+ if (vllmSettings) vllmSettings.style.display = 'none';
+ if (tensorrtSettings) tensorrtSettings.style.display = 'none';
+ if (sglangSettings) sglangSettings.style.display = 'none';
+
+ // Show relevant section based on engine type
+ switch (engineType) {
+ case 'tgi':
+ if (tgiSettings) tgiSettings.style.display = 'block';
+ break;
+ case 'vllm':
+ if (vllmSettings) vllmSettings.style.display = 'block';
+ break;
+ case 'tensorrt_llm':
+ if (tensorrtSettings) tensorrtSettings.style.display = 'block';
+ break;
+ case 'sglang':
+ if (sglangSettings) sglangSettings.style.display = 'block';
+ break;
+ case 'huggingface':
+ default:
+ // No additional settings for HuggingFace
+ break;
+ }
+ }
+
  /**
   * Gather the entire training form into the request payload shape the
   * /calculate API expects: {model, training, parallelism, engine, hardware}.
   *
   * @returns {object} the config object; numeric fields are parsed from the
   *   inputs, MoE fields collapse to 1/null when MoE is disabled or blank.
   */
  collectFormData() {
    // GPU memory: 'custom' selection defers to the free-form input.
    let gpuMem = document.getElementById('gpu-model').value;
    if (gpuMem === 'custom') {
      gpuMem = parseFloat(document.getElementById('gpu-mem-custom').value);
    } else {
      gpuMem = parseFloat(gpuMem);
    }

    // Engine type drives which engine-specific fields below are relevant.
    const engineType = document.getElementById('engine-type').value;

    // MoE parameters (optional size fields may be blank).
    const moeEnabled = document.getElementById('moe-enabled').checked;
    const expertIntermediateSize = document.getElementById('expert-intermediate-size').value;
    const sharedExpertSize = document.getElementById('shared-expert-size').value;

    return {
      model: {
        name: document.getElementById('model-name').value,
        num_parameters: document.getElementById('num-params').value,
        num_layers: parseInt(document.getElementById('num-layers').value),
        hidden_size: parseInt(document.getElementById('hidden-size').value),
        num_attention_heads: parseInt(document.getElementById('num-heads').value),
        vocab_size: parseInt(document.getElementById('vocab-size').value),
        max_seq_len: parseInt(document.getElementById('seq-len').value),
        moe_enabled: moeEnabled,
        // With MoE off, expert counts collapse to 1 (dense model).
        num_experts: moeEnabled ? parseInt(document.getElementById('num-experts').value) : 1,
        top_k: moeEnabled ? parseInt(document.getElementById('top-k').value) : 1,
        expert_intermediate_size: expertIntermediateSize ? parseInt(expertIntermediateSize) : null,
        shared_expert_intermediate_size: sharedExpertSize ? parseInt(sharedExpertSize) : null,
      },
      training: {
        batch_size: parseInt(document.getElementById('batch-size').value),
        gradient_accumulation_steps: parseInt(document.getElementById('grad-accum').value),
        optimizer: document.getElementById('optimizer').value,
        dtype: document.getElementById('dtype').value,
        activation_checkpointing: parseInt(document.getElementById('activation-checkpointing').value),
      },
      parallelism: {
        tensor_parallel_size: parseInt(document.getElementById('tensor-pp').value),
        pipeline_parallel_size: parseInt(document.getElementById('pipeline-pp').value),
        data_parallel_size: parseInt(document.getElementById('data-pp').value),
        sequence_parallel: document.getElementById('seq-parallel').checked,
      },
      engine: {
        type: engineType,
        zero_stage: parseInt(document.getElementById('zero-stage').value),
        offload_optimizer: document.getElementById('offload-optimizer').value,
        offload_param: document.getElementById('offload-param').value,
        zero_init: document.getElementById('zero-init').checked,
        // Optional-chained: these controls only exist for some engines.
        sharding_strategy: document.getElementById('sharding-strategy')?.value || null,
        use_distributed_optimizer: document.getElementById('use-distributed-optimizer')?.checked || false,
        num_micro_batches: parseInt(document.getElementById('num-micro-batches')?.value || 1),
      },
      hardware: {
        num_gpus: parseInt(document.getElementById('num-gpus').value),
        gpu_memory_gb: gpuMem,
      },
    };
  }
+
+ async calculateMemory() {
+ // Client-side validation first
+ const validation = this.validateForm();
+ if (!validation.valid) {
+ // Show validation errors inline
+ this.showError(`Validation error: ${validation.errors[0]}`);
+ return;
+ }
+
+ const config = this.collectFormData();
+ this.lastConfig = config; // Store for formula explanation
+ const calculateBtn = document.getElementById('calculate-btn');
+
+ // Update last calculation time
+ this.lastCalculationTime = Date.now();
+
+ // Show loading state
+ calculateBtn.disabled = true;
+ calculateBtn.textContent = 'Calculating...';
+
+ try {
+ const response = await fetch(`${this.apiBase}/calculate`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify(config),
+ });
+
+ if (!response.ok) {
+ const error = await response.json();
+ const errorMsg = error.detail?.message || error.detail || 'Calculation failed';
+ throw new Error(errorMsg);
+ }
+
+ const result = await response.json();
+ this.displayResults(result);
+ } catch (error) {
+ this.showError(`Calculation failed: ${error.message}`);
+ } finally {
+ calculateBtn.disabled = false;
+ calculateBtn.textContent = 'Calculate';
+ }
+ }
+
+ displayResults(result) {
+ // Main memory results
+ document.getElementById('result-per-gpu').textContent = `${result.total_memory_per_gpu_gb.toFixed(2)} GB`;
+ document.getElementById('result-total').textContent = `${result.total_memory_all_gpus_gb.toFixed(2)} GB`;
+ document.getElementById('result-cpu').textContent = `${result.cpu_memory_gb.toFixed(2)} GB`;
+
+ // Breakdown
+ document.getElementById('breakdown-params').textContent = `${result.breakdown.model_params_gb.toFixed(2)} GB`;
+ document.getElementById('breakdown-grads').textContent = `${result.breakdown.gradients_gb.toFixed(2)} GB`;
+ document.getElementById('breakdown-optimizer').textContent = `${result.breakdown.optimizer_states_gb.toFixed(2)} GB`;
+ document.getElementById('breakdown-activations').textContent = `${result.breakdown.activations_gb.toFixed(2)} GB`;
+ document.getElementById('breakdown-overhead').textContent = `${result.breakdown.overhead_gb.toFixed(2)} GB`;
+
+ // Update bar chart
+ this.updateBarChart(result.breakdown);
+
+ // Feasibility
+ const statusEl = document.getElementById('feasibility-status');
+ const utilEl = document.getElementById('feasibility-util');
+ const recommendedBatchEl = document.getElementById('recommended-batch-container');
+ const recommendedBatchValue = document.getElementById('recommended-batch');
+
+ utilEl.textContent = `${result.memory_utilization_percent.toFixed(1)}%`;
+
+ if (result.fits_on_gpu) {
+ statusEl.textContent = 'โ Fits on GPU';
+ statusEl.className = 'metric-value status-success';
+ recommendedBatchEl.style.display = 'none';
+ } else {
+ statusEl.textContent = 'โ OOM (Out of Memory)';
+ statusEl.className = 'metric-value status-danger';
+ if (result.recommended_batch_size) {
+ recommendedBatchValue.textContent = result.recommended_batch_size;
+ recommendedBatchEl.style.display = 'flex';
+ }
+ }
+
+ // Color code utilization
+ if (result.memory_utilization_percent < 80) {
+ utilEl.className = 'metric-value status-success';
+ } else if (result.memory_utilization_percent < 95) {
+ utilEl.className = 'metric-value status-warning';
+ } else {
+ utilEl.className = 'metric-value status-danger';
+ }
+ }
+
+ updateBarChart(breakdown) {
+ const total = breakdown.model_params_gb + breakdown.gradients_gb +
+ breakdown.optimizer_states_gb + breakdown.activations_gb;
+
+ const paramsPct = (breakdown.model_params_gb / total) * 100;
+ const gradsPct = (breakdown.gradients_gb / total) * 100;
+ const optimizerPct = (breakdown.optimizer_states_gb / total) * 100;
+ const activationsPct = (breakdown.activations_gb / total) * 100;
+
+ document.getElementById('bar-params').style.width = `${paramsPct}%`;
+ document.getElementById('bar-grads').style.width = `${gradsPct}%`;
+ document.getElementById('bar-optimizer').style.width = `${optimizerPct}%`;
+ document.getElementById('bar-activations').style.width = `${activationsPct}%`;
+ }
+
+ async showFormulaExplanation() {
+ if (!this.lastConfig) {
+ this.showError('Please run a calculation first to see the formula explanation.');
+ return;
+ }
+
+ try {
+ const response = await fetch(`${this.apiBase}/explain-formula`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify(this.lastConfig),
+ });
+
+ if (!response.ok) {
+ throw new Error('Failed to get formula explanation');
+ }
+
+ const formulaInfo = await response.json();
+ this.displayFormulaExplanation(formulaInfo);
+ } catch (error) {
+ this.showError(`Failed to load formula explanation: ${error.message}`);
+ }
+ }
+
+ displayFormulaExplanation(formulaInfo) {
+ // Update formula description
+ const descEl = document.getElementById('formula-description');
+ descEl.innerHTML = `
+ Engine: ${formulaInfo.engine_name}
+ Total Memory: ${formulaInfo.total_memory_gb} GB
+ ${formulaInfo.formula_description || ''}
+ `;
+
+ // Update formula components
+ const componentsEl = document.getElementById('formula-components');
+ componentsEl.style.display = 'block';
+
+ let componentsHTML = 'Formula Components:
';
+ componentsEl.innerHTML = componentsHTML;
+
+ // Update references
+ const refsEl = document.getElementById('references-list');
+ const refsContainer = document.querySelector('.formula-references');
+ refsContainer.style.display = 'block';
+
+ let refsHTML = '';
+ formulaInfo.references.forEach(ref => {
+ refsHTML += `${ref.title}`;
+ });
+ refsEl.innerHTML = refsHTML;
+
+ // Update button text and set visibility flag
+ const btn = document.getElementById('show-formula-btn');
+ btn.textContent = 'Hide Formula Details';
+ this.formulaDetailsVisible = true;
+ }
+
+ hideFormulaExplanation() {
+ document.getElementById('formula-components').style.display = 'none';
+ document.querySelector('.formula-references').style.display = 'none';
+
+ const btn = document.getElementById('show-formula-btn');
+ btn.textContent = 'Show Formula Details';
+ this.formulaDetailsVisible = false;
+ }
+
+ async toggleFormulaExplanation() {
+ if (!this.lastConfig) {
+ this.showError('Please run a calculation first to see the formula explanation.');
+ return;
+ }
+
+ if (this.formulaDetailsVisible) {
+ // Currently visible, hide it
+ this.hideFormulaExplanation();
+ } else {
+ // Currently hidden, show it
+ await this.showFormulaExplanation();
+ }
+ }
+
  /**
   * Restore every training-form control to its default value (a 7B dense
   * model on 8×80GB GPUs with DeepSpeed ZeRO-3) and clear all result
   * readouts back to placeholders.
   */
  resetForm() {
    // Model defaults.
    document.getElementById('preset-select').value = 'custom';
    document.getElementById('model-name').value = 'custom-model';
    document.getElementById('num-params').value = '7B';
    document.getElementById('num-layers').value = '32';
    document.getElementById('hidden-size').value = '4096';
    document.getElementById('num-heads').value = '32';
    document.getElementById('vocab-size').value = '32000';
    document.getElementById('seq-len').value = '4096';

    // MoE defaults (disabled, fields hidden).
    document.getElementById('moe-enabled').checked = false;
    document.getElementById('num-experts').value = '8';
    document.getElementById('top-k').value = '2';
    document.getElementById('expert-intermediate-size').value = '';
    document.getElementById('shared-expert-size').value = '';
    this.toggleMoEFields(false);

    // Training / parallelism / engine / hardware defaults.
    document.getElementById('batch-size').value = '4';
    document.getElementById('batch-size-slider').value = '4';
    document.getElementById('grad-accum').value = '4';
    document.getElementById('optimizer').value = 'adamw';
    document.getElementById('dtype').value = 'bf16';
    document.getElementById('activation-checkpointing').value = '2';
    document.getElementById('tensor-pp').value = '1';
    document.getElementById('pipeline-pp').value = '1';
    document.getElementById('data-pp').value = '8';
    document.getElementById('seq-parallel').checked = false;
    document.getElementById('engine-type').value = 'deepspeed';
    document.getElementById('zero-stage').value = '3';
    document.getElementById('offload-optimizer').value = 'cpu';
    document.getElementById('offload-param').value = 'none';
    document.getElementById('zero-init').checked = true;
    document.getElementById('num-gpus').value = '8';
    document.getElementById('gpu-model').value = '80';

    // Sync dependent UI (engine-specific fields, effective-GPU readout).
    this.updateEngineFields('deepspeed');
    this.updateEffectiveGPUs();

    // Clear all result readouts back to placeholders.
    document.getElementById('result-per-gpu').textContent = '-- GB';
    document.getElementById('result-total').textContent = '-- GB';
    document.getElementById('result-cpu').textContent = '-- GB';
    document.getElementById('breakdown-params').textContent = '-- GB';
    document.getElementById('breakdown-grads').textContent = '-- GB';
    document.getElementById('breakdown-optimizer').textContent = '-- GB';
    document.getElementById('breakdown-activations').textContent = '-- GB';
    document.getElementById('breakdown-overhead').textContent = '-- GB';
    document.getElementById('feasibility-status').textContent = '--';
    document.getElementById('feasibility-util').textContent = '--%';
  }
+
+ saveConfig() {
+ const config = this.collectFormData();
+ const jsonStr = JSON.stringify(config, null, 2);
+ const blob = new Blob([jsonStr], { type: 'application/json' });
+ const url = URL.createObjectURL(blob);
+
+ const a = document.createElement('a');
+ a.href = url;
+ a.download = `gpu-mem-config-${Date.now()}.json`;
+ document.body.appendChild(a);
+ a.click();
+ document.body.removeChild(a);
+ URL.revokeObjectURL(url);
+ }
+
+ async copyConfigJSON() {
+ const config = this.collectFormData();
+ const jsonStr = JSON.stringify(config, null, 2);
+
+ try {
+ await navigator.clipboard.writeText(jsonStr);
+ this.showError('Config copied to clipboard!', true);
+ } catch (error) {
+ // Fallback for older browsers
+ const textarea = document.createElement('textarea');
+ textarea.value = jsonStr;
+ document.body.appendChild(textarea);
+ textarea.select();
+ document.execCommand('copy');
+ document.body.removeChild(textarea);
+ this.showError('Config copied to clipboard!', true);
+ }
+ }
+
+ showError(message, isSuccess = false) {
+ const errorEl = document.getElementById('error-message');
+ errorEl.textContent = message;
+ errorEl.style.display = 'block';
+ errorEl.style.backgroundColor = isSuccess ? 'var(--success-color)' : 'var(--danger-color)';
+
+ setTimeout(() => {
+ errorEl.style.display = 'none';
+ }, 3000);
+ }
+
  /**
   * Collect the inference form, POST it to /inference/calculate, and
   * render the response. All engine-specific fields are sent; the API is
   * presumably responsible for ignoring fields irrelevant to the selected
   * engine — TODO confirm.
   */
  async calculateInferenceMemory() {
    try {
      // Helpers: map a blank input to null (string / int / float variants)
      // so optional fields are omitted-as-null in the payload.
      const getValOrNull = (id) => {
        const val = document.getElementById(id).value;
        return val === '' ? null : val;
      };

      const getIntOrNull = (id) => {
        const val = document.getElementById(id).value;
        return val === '' ? null : parseInt(val);
      };

      const getFloatOrNull = (id) => {
        const val = document.getElementById(id).value;
        return val === '' ? null : parseFloat(val);
      };

      const config = {
        model: {
          name: document.getElementById('inference-model-name').value,
          num_parameters: document.getElementById('inference-num-params').value,
          num_layers: parseInt(document.getElementById('inference-num-layers').value),
          hidden_size: parseInt(document.getElementById('inference-hidden-size').value),
          num_attention_heads: parseInt(document.getElementById('inference-num-heads').value),
          vocab_size: parseInt(document.getElementById('inference-vocab-size').value),
          max_seq_len: parseInt(document.getElementById('inference-seq-len').value),
        },
        inference: {
          // Common serving parameters.
          engine_type: document.getElementById('inference-engine').value,
          batch_size: parseInt(document.getElementById('inference-batch-size').value),
          kv_cache_quantization: document.getElementById('kv-cache-quantization').value,
          tensor_parallel_size: parseInt(document.getElementById('tensor-parallel-size').value),
          gpu_memory_utilization: parseFloat(document.getElementById('gpu-memory-util').value),
          use_kv_cache: document.getElementById('use-kv-cache').checked,
          // TGI-specific
          max_total_tokens: getIntOrNull('max-total-tokens'),
          max_input_tokens: getIntOrNull('max-input-tokens'),
          max_batch_total_tokens: getIntOrNull('max-batch-total-tokens'),
          tgi_quantize: getValOrNull('tgi-quantize') || 'none',
          tgi_dtype: getValOrNull('tgi-dtype') || 'bfloat16',
          sharded: document.getElementById('sharded').checked,
          num_shard: getIntOrNull('num-shard'),
          // vLLM-specific
          block_size: getIntOrNull('block-size'),
          swap_space_gb: getFloatOrNull('swap-space-gb') || 0.0,
          enable_prefix_caching: document.getElementById('enable-prefix-caching').checked,
          enforce_eager: document.getElementById('enforce-eager').checked,
          max_num_batched_tokens: getIntOrNull('max-num-batched-tokens'),
          max_num_seqs: getIntOrNull('max-num-seqs'),
          vllm_quantization: getValOrNull('vllm-quantization') || 'none',
          // TensorRT-LLM-specific
          trt_max_batch_size: getIntOrNull('trt-max-batch-size'),
          trt_max_input_len: getIntOrNull('trt-max-input-len'),
          trt_max_seq_len: getIntOrNull('trt-max-seq-len'),
          trt_max_beam_width: getIntOrNull('trt-max-beam-width'),
          // SGLang-specific
          chunk_size: getIntOrNull('chunk-size'),
          max_running_requests: getIntOrNull('max-running-requests'),
          disable_radix_cache: document.getElementById('disable-radix-cache').checked,
          enable_p2p: document.getElementById('enable-p2p').checked,
          disable_custom_all_reduce: document.getElementById('disable-custom-all-reduce').checked,
          attention_backend: getValOrNull('attention-backend') || 'flashinfer',
          enable_torch_compile: document.getElementById('enable-torch-compile').checked,
          radix_cache_max_seq_len: getIntOrNull('radix-cache-max-seq-len'),
          speculative_algo: getValOrNull('speculative-algo') || 'default',
          multi_lora_enabled: document.getElementById('multi-lora-enabled').checked,
        },
        hardware: {
          num_gpus: parseInt(document.getElementById('inference-num-gpus').value),
          gpu_memory_gb: parseInt(document.getElementById('inference-gpu-model').value),
        },
      };

      const response = await fetch(`${this.apiBase}/inference/calculate`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(config),
      });

      if (!response.ok) {
        throw new Error('Failed to calculate inference memory');
      }

      const result = await response.json();
      this.displayInferenceResults(result);
    } catch (error) {
      this.showError(`Error: ${error.message}`);
    }
  }
+
+ displayInferenceResults(result) {
+ document.getElementById('inference-result-per-gpu').textContent = `${result.total_memory_per_gpu_gb.toFixed(2)} GB`;
+ document.getElementById('inference-result-total').textContent = `${result.total_memory_all_gpus_gb.toFixed(2)} GB`;
+ document.getElementById('inference-result-params').textContent = `${result.breakdown.model_params_gb.toFixed(2)} GB`;
+ document.getElementById('inference-result-kv-cache').textContent = `${result.breakdown.kv_cache_gb.toFixed(2)} GB`;
+ document.getElementById('inference-result-activations').textContent = `${result.breakdown.activations_gb.toFixed(2)} GB`;
+ document.getElementById('inference-max-batch').textContent = result.max_supported_batch_size || 'N/A';
+ document.getElementById('inference-throughput').textContent = result.estimated_throughput_tokens_per_sec
+ ? `${result.estimated_throughput_tokens_per_sec.toFixed(0)} tokens/sec`
+ : 'N/A';
+ document.getElementById('inference-fits').textContent = result.fits_on_gpu ? 'โ Yes' : 'โ No';
+ document.getElementById('inference-fits').style.color = result.fits_on_gpu ? 'var(--success-color)' : 'var(--danger-color)';
+ document.getElementById('inference-utilization').textContent = `${result.memory_utilization_percent.toFixed(1)}%`;
+ }
+
+ resetInferenceForm() {
+ document.getElementById('inference-preset-select').value = 'custom';
+ document.getElementById('inference-model-name').value = 'custom-model';
+ document.getElementById('inference-num-params').value = '7B';
+ document.getElementById('inference-num-layers').value = '32';
+ document.getElementById('inference-hidden-size').value = '4096';
+ document.getElementById('inference-num-heads').value = '32';
+ document.getElementById('inference-vocab-size').value = '32000';
+ document.getElementById('inference-seq-len').value = '4096';
+ document.getElementById('inference-batch-size').value = '32';
+ document.getElementById('kv-cache-quantization').value = 'none';
+ document.getElementById('tensor-parallel-size').value = '1';
+ document.getElementById('gpu-memory-util').value = '0.9';
+ document.getElementById('gpu-memory-util-value').textContent = '0.90';
+ document.getElementById('inference-num-gpus').value = '1';
+ document.getElementById('inference-gpu-model').value = '80';
+ document.getElementById('use-kv-cache').checked = true;
+
+ // Reset TGI-specific fields
+ document.getElementById('max-total-tokens').value = '4096';
+ document.getElementById('max-input-tokens').value = '2048';
+ document.getElementById('max-batch-total-tokens').value = '8192';
+ document.getElementById('tgi-quantize').value = 'none';
+ document.getElementById('tgi-dtype').value = 'bfloat16';
+ document.getElementById('sharded').checked = false;
+ document.getElementById('num-shard').value = '1';
+
+ // Reset vLLM-specific fields
+ document.getElementById('block-size').value = '';
+ document.getElementById('swap-space-gb').value = '0';
+ document.getElementById('enable-prefix-caching').checked = false;
+ document.getElementById('enforce-eager').checked = false;
+ document.getElementById('max-num-batched-tokens').value = '';
+ document.getElementById('max-num-seqs').value = '';
+ document.getElementById('vllm-quantization').value = 'none';
+
+ // Reset TensorRT-LLM-specific fields
+ document.getElementById('trt-max-batch-size').value = '2048';
+ document.getElementById('trt-max-input-len').value = '1024';
+ document.getElementById('trt-max-seq-len').value = '2048';
+ document.getElementById('trt-max-beam-width').value = '1';
+
+ // Reset SGLang-specific fields
+ document.getElementById('chunk-size').value = '8192';
+ document.getElementById('max-running-requests').value = '128';
+ document.getElementById('radix-cache-max-seq-len').value = '8192';
+ document.getElementById('attention-backend').value = 'flashinfer';
+ document.getElementById('speculative-algo').value = 'default';
+ document.getElementById('disable-radix-cache').checked = false;
+ document.getElementById('enable-p2p').checked = false;
+ document.getElementById('disable-custom-all-reduce').checked = false;
+ document.getElementById('enable-torch-compile').checked = false;
+ document.getElementById('multi-lora-enabled').checked = false;
+
+ // Clear results
+ document.getElementById('inference-result-per-gpu').textContent = '-- GB';
+ document.getElementById('inference-result-total').textContent = '-- GB';
+ document.getElementById('inference-result-params').textContent = '-- GB';
+ document.getElementById('inference-result-kv-cache').textContent = '-- GB';
+ document.getElementById('inference-result-activations').textContent = '-- GB';
+ document.getElementById('inference-max-batch').textContent = '--';
+ document.getElementById('inference-throughput').textContent = '-- tokens/sec';
+ document.getElementById('inference-fits').textContent = '--';
+ document.getElementById('inference-utilization').textContent = '--%';
+
+ // Reset engine-specific sections visibility
+ const engineType = document.getElementById('inference-engine').value;
+ this.updateInferenceEngineFields(engineType);
+ }
+
+ /**
+ * Calculate multi-node network overhead
+ */
+ async calculateMultiNode() {
+ try {
+ const config = {
+ model: {
+ num_parameters: document.getElementById('multinode-num-params').value,
+ },
+ training: {
+ dtype: document.getElementById('multinode-dtype').value,
+ batch_size: parseInt(document.getElementById('multinode-batch-size').value),
+ seq_length: parseInt(document.getElementById('multinode-seq-len').value),
+ },
+ parallelism: {
+ tensor_parallel_size: parseInt(document.getElementById('multinode-tensor-pp').value),
+ pipeline_parallel_size: parseInt(document.getElementById('multinode-pipeline-pp').value),
+ sequence_parallel: document.getElementById('multinode-seq-parallel').checked,
+ },
+ engine: {
+ type: document.getElementById('multinode-engine').value,
+ zero_stage: parseInt(document.getElementById('multinode-zero-stage').value),
+ },
+ node_config: {
+ num_nodes: parseInt(document.getElementById('num-nodes').value),
+ gpus_per_node: parseInt(document.getElementById('gpus-per-node').value),
+ interconnect_type: document.getElementById('interconnect-type').value,
+ },
+ optimize_strategy: document.getElementById('multinode-optimize').checked,
+ };
+
+ const response = await fetch(`${this.apiBase}/multinode/calculate`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(config),
+ });
+
+ if (!response.ok) {
+ throw new Error('Failed to calculate multi-node overhead');
+ }
+
+ const result = await response.json();
+ this.displayMultiNodeResults(result);
+ } catch (error) {
+ this.showError(`Error: ${error.message}`);
+ }
+ }
+
+ /**
+  * Render the /multinode/calculate response into the multi-node results panel.
+  * @param {Object} result - API response; expects a `network_overhead` object
+  *   and an optional `suggestions` string array.
+  */
+ displayMultiNodeResults(result) {
+ const overhead = result.network_overhead;
+ document.getElementById('multinode-overhead-total').textContent = `${overhead.total_overhead_gb.toFixed(2)} GB`;
+ document.getElementById('multinode-overhead-allreduce').textContent = `${overhead.allreduce_gb.toFixed(2)} GB`;
+ document.getElementById('multinode-overhead-allgather').textContent = `${overhead.allgather_gb.toFixed(2)} GB`;
+ // Optional metrics use `?.` + a fallback so absent fields render as a
+ // placeholder instead of "undefined".
+ document.getElementById('multinode-overhead-reducescatter').textContent = `${overhead.reducescatter_gb?.toFixed(2) || '0.00'} GB`;
+ document.getElementById('multinode-overhead-pipeline').textContent = `${overhead.pipeline_gb?.toFixed(2) || '0.00'} GB`;
+ document.getElementById('multinode-time-overhead').textContent = `${overhead.estimated_overhead_ms_per_step?.toFixed(2) || 'N/A'} ms/step`;
+ document.getElementById('multinode-comm-time').textContent = `${overhead.communication_time_ms_per_step?.toFixed(2) || 'N/A'} ms/step`;
+ document.getElementById('multinode-latency').textContent = `${overhead.latency_overhead_ms?.toFixed(2) || 'N/A'} ms`;
+
+ // Display suggestions
+ // NOTE(review): the innerHTML literals below look garbled — the markup
+ // (presumably list tags) appears to have been stripped from this copy of
+ // the file, leaving multi-line template strings with bare text. Confirm
+ // against the original source before editing these literals.
+ const suggestionsDiv = document.getElementById('multinode-suggestions');
+ if (result.suggestions && result.suggestions.length > 0) {
+ suggestionsDiv.innerHTML = '' + result.suggestions.map(s => `- ${s}
`).join('') + '
';
+ } else {
+ suggestionsDiv.innerHTML = 'No optimization suggestions available.
';
+ }
+ }
+
+ /**
+  * Restore every control on the multi-node tab to its default value and put
+  * the results panel back to its placeholder state.
+  */
+ resetMultiNodeForm() {
+ document.getElementById('multinode-preset-select').value = 'custom';
+ document.getElementById('multinode-num-params').value = '7B';
+ document.getElementById('multinode-dtype').value = 'bf16';
+ document.getElementById('num-nodes').value = '2';
+ document.getElementById('gpus-per-node').value = '8';
+ // Derived readout: 2 nodes x 8 GPUs.
+ document.getElementById('multinode-total-gpus').textContent = '16';
+ document.getElementById('interconnect-type').value = 'infiniband';
+ document.getElementById('multinode-engine').value = 'deepspeed';
+ document.getElementById('multinode-zero-stage').value = '3';
+ document.getElementById('multinode-batch-size').value = '4';
+ document.getElementById('multinode-seq-len').value = '4096';
+ document.getElementById('multinode-tensor-pp').value = '1';
+ document.getElementById('multinode-pipeline-pp').value = '1';
+ document.getElementById('multinode-seq-parallel').checked = false;
+ document.getElementById('multinode-optimize').checked = true;
+
+ // Clear results
+ document.getElementById('multinode-overhead-total').textContent = '-- GB';
+ document.getElementById('multinode-overhead-allreduce').textContent = '-- GB';
+ document.getElementById('multinode-overhead-allgather').textContent = '-- GB';
+ document.getElementById('multinode-overhead-reducescatter').textContent = '-- GB';
+ document.getElementById('multinode-overhead-pipeline').textContent = '-- GB';
+ document.getElementById('multinode-time-overhead').textContent = '-- ms/step';
+ document.getElementById('multinode-comm-time').textContent = '-- ms/step';
+ document.getElementById('multinode-latency').textContent = '-- ms';
+ // NOTE(review): this innerHTML literal looks garbled — its surrounding
+ // markup appears stripped from this copy; confirm against the original
+ // source before editing.
+ document.getElementById('multinode-suggestions').innerHTML = 'Run calculation to see optimization suggestions.
';
+ }
+
+ applyInferenceConfig(config) {
+ // Apply model configuration to inference form
+ if (config.model) {
+ if (config.model.name) {
+ document.getElementById('inference-model-name').value = config.model.name;
+ }
+ if (config.model.num_parameters) {
+ document.getElementById('inference-num-params').value = config.model.num_parameters;
+ }
+ if (config.model.num_layers) {
+ document.getElementById('inference-num-layers').value = config.model.num_layers;
+ }
+ if (config.model.hidden_size) {
+ document.getElementById('inference-hidden-size').value = config.model.hidden_size;
+ }
+ if (config.model.num_attention_heads) {
+ document.getElementById('inference-num-heads').value = config.model.num_attention_heads;
+ }
+ if (config.model.vocab_size) {
+ document.getElementById('inference-vocab-size').value = config.model.vocab_size;
+ }
+ if (config.model.max_seq_len) {
+ document.getElementById('inference-seq-len').value = config.model.max_seq_len;
+ }
+ }
+ }
+
+ applyMultiNodeConfig(config) {
+ // Apply model configuration to multinode form
+ if (config.model) {
+ if (config.model.num_parameters) {
+ document.getElementById('multinode-num-params').value = config.model.num_parameters;
+ }
+ }
+ }
+
+ /**
+ * Show export framework modal
+ */
+ showExportModal() {
+ const format = prompt('Select export format:\n1 - Accelerate\n2 - Lightning\n3 - Axolotl\n4 - DeepSpeed\n5 - YAML\n6 - JSON\n\nEnter number (1-6):');
+
+ if (!format) return;
+
+ const formatMap = {
+ '1': 'accelerate',
+ '2': 'lightning',
+ '3': 'axolotl',
+ '4': 'deepspeed',
+ '5': 'yaml',
+ '6': 'json',
+ };
+
+ const selectedFormat = formatMap[format];
+ if (!selectedFormat) {
+ this.showError('Invalid format selected');
+ return;
+ }
+
+ this.exportFrameworkConfig(selectedFormat);
+ }
+
+ async exportFrameworkConfig(format) {
+ try {
+ const config = this.collectFormData();
+ const response = await fetch(`${this.apiBase}/export/${format}`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(config),
+ });
+
+ if (!response.ok) {
+ throw new Error(`Failed to export ${format} config`);
+ }
+
+ const result = await response.json();
+ this.downloadConfig(result, format);
+ } catch (error) {
+ this.showError(`Error: ${error.message}`);
+ }
+ }
+}
+
+// Initialize the calculator when DOM is ready
+document.addEventListener('DOMContentLoaded', () => {
+ new GPUMemCalculator();
+});
diff --git a/web/templates/index.html b/web/templates/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..89da3e8b9904998ef0417d291a99793eb9c34deb
--- /dev/null
+++ b/web/templates/index.html
@@ -0,0 +1,999 @@
+
+
+
+
+
+ GPU Memory Calculator for LLM Training
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Training Configuration
+
+
+
+ Model Settings
+
+
+
+
+
+
+
+
+
+
+ Mixture of Experts (MoE)
+
+
+
+
+
+
+
With MoE, only 2 of 8 experts are active per token, reducing activation memory.
+
+
+
+
+
+
+
+
+ Parallelism
+
+ Effective GPUs: 8
+
+
+
+
+ Training Engine
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Results
+
+
+
Memory Breakdown
+
+ Per GPU:
+ -- GB
+
+
+ Total All GPUs:
+ -- GB
+
+
+ CPU Memory:
+ -- GB
+
+
+
+
+
Component Breakdown
+
+ Model Parameters:
+ -- GB
+
+
+ Gradients:
+ -- GB
+
+
+ Optimizer States:
+ -- GB
+
+
+ Activations:
+ -- GB
+
+
+ Overhead:
+ -- GB
+
+
+
+
+
+ Params
+ Grads
+ Opt
+ Act
+
+
+
+
+
Feasibility
+
+ Status:
+ --
+
+
+ Utilization:
+ --%
+
+
+ Recommended Batch:
+ --
+
+
+
+
+
Formula Explanation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Inference Configuration
+
+
+
+ Model Settings
+
+
+
+
+
+
+
+
+
+
+
+
+
+ TGI-Specific Settings
+
+
+
+
+
+ vLLM-Specific Settings
+
+
+
+
+
+ TensorRT-LLM-Specific Settings
+
+
+
+
+
+ SGLang-Specific Settings
+
+
+
+
+
+
+
+
+
+
+
+
+
Inference Results
+
+
+
Memory Breakdown
+
+ Per GPU:
+ -- GB
+
+
+ Total All GPUs:
+ -- GB
+
+
+ Model Parameters:
+ -- GB
+
+
+ KV Cache:
+ -- GB
+
+
+ Activations:
+ -- GB
+
+
+
+
+
Performance Estimates
+
+ Max Batch Size:
+ --
+
+
+ Estimated Throughput:
+ -- tokens/sec
+
+
+ Fits on GPU:
+ --
+
+
+ Utilization:
+ --%
+
+
+
+
+
+
+
+
+
Multi-Node Training Configuration
+
Calculate network communication overhead for distributed training across multiple nodes.
+
+
+
+ Model Settings
+
+
+
+
+
+
+
+
+
+
+ Node Configuration
+
+ Total GPUs: 16
+
+
+
+
+ Training Configuration
+
+
+
+
+
+ Parallelism Strategy
+
+
+
+
+
+
+
+
+
+
+
+
+
Multi-Node Results
+
+
+
Network Overhead
+
+ Total Overhead:
+ -- GB
+
+
+ AllReduce:
+ -- GB
+
+
+ AllGather:
+ -- GB
+
+
+ ReduceScatter:
+ -- GB
+
+
+ Pipeline Comm:
+ -- GB
+
+
+
+
+
Time Impact
+
+ Est. Overhead:
+ -- ms/step
+
+
+ Communication Time:
+ -- ms/step
+
+
+ Latency Impact:
+ -- ms
+
+
+
+
+
Optimization Suggestions
+
+
Run calculation to see optimization suggestions.
+
+
+
+
+
+
+
+
+
+
+
+
+