Spaces:
Sleeping
Sleeping
George Yang commited on
Commit ·
36ed1cd
1
Parent(s): 8e7e10d
Initial deployment: Add GPU Memory Calculator with Docker
Browse files- Add FastAPI web application
- Add all calculator modules (training, inference, multi-node)
- Configure Docker for Python 3.12
- Add requirements.txt with web dependencies
- Add Space README with metadata
This view is limited to 50 files because it contains too many changes.
See raw diff
- .dockerignore +92 -0
- Dockerfile +40 -0
- README.md +57 -7
- cli/main.py +399 -0
- requirements.txt +12 -0
- src/gpu_mem_calculator.egg-info/PKG-INFO +720 -0
- src/gpu_mem_calculator.egg-info/SOURCES.txt +46 -0
- src/gpu_mem_calculator.egg-info/dependency_links.txt +1 -0
- src/gpu_mem_calculator.egg-info/entry_points.txt +2 -0
- src/gpu_mem_calculator.egg-info/requires.txt +16 -0
- src/gpu_mem_calculator.egg-info/top_level.txt +1 -0
- src/gpu_mem_calculator/__init__.py +3 -0
- src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/cli/__init__.py +5 -0
- src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/cli/main.py +399 -0
- src/gpu_mem_calculator/config/__init__.py +5 -0
- src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/config/parser.py +323 -0
- src/gpu_mem_calculator/config/presets.py +83 -0
- src/gpu_mem_calculator/core/__init__.py +24 -0
- src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/core/calculator.py +178 -0
- src/gpu_mem_calculator/core/formulas.py +268 -0
- src/gpu_mem_calculator/core/models.py +568 -0
- src/gpu_mem_calculator/core/multinode.py +308 -0
- src/gpu_mem_calculator/engines/__init__.py +16 -0
- src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/engines/base.py +220 -0
- src/gpu_mem_calculator/engines/deepspeed.py +316 -0
- src/gpu_mem_calculator/engines/fsdp.py +213 -0
- src/gpu_mem_calculator/engines/megatron.py +257 -0
- src/gpu_mem_calculator/engines/pytorch.py +88 -0
- src/gpu_mem_calculator/exporters/__init__.py +14 -0
- src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc +0 -0
- src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Git
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
.github
|
| 5 |
+
|
| 6 |
+
# Docker
|
| 7 |
+
Dockerfile
|
| 8 |
+
.dockerignore
|
| 9 |
+
|
| 10 |
+
# Python
|
| 11 |
+
__pycache__
|
| 12 |
+
*.py[cod]
|
| 13 |
+
*$py.class
|
| 14 |
+
*.so
|
| 15 |
+
.Python
|
| 16 |
+
build/
|
| 17 |
+
develop-eggs/
|
| 18 |
+
dist/
|
| 19 |
+
downloads/
|
| 20 |
+
eggs/
|
| 21 |
+
.eggs/
|
| 22 |
+
lib/
|
| 23 |
+
lib64/
|
| 24 |
+
parts/
|
| 25 |
+
sdist/
|
| 26 |
+
wheels/
|
| 27 |
+
*.egg-info/
|
| 28 |
+
.installed.cfg
|
| 29 |
+
*.egg
|
| 30 |
+
|
| 31 |
+
# Virtual environments
|
| 32 |
+
venv/
|
| 33 |
+
env/
|
| 34 |
+
ENV/
|
| 35 |
+
.venv/
|
| 36 |
+
.env
|
| 37 |
+
|
| 38 |
+
# Testing
|
| 39 |
+
.pytest_cache/
|
| 40 |
+
.coverage
|
| 41 |
+
coverage.xml
|
| 42 |
+
htmlcov/
|
| 43 |
+
.tox/
|
| 44 |
+
.mypy_cache/
|
| 45 |
+
.ruff_cache/
|
| 46 |
+
|
| 47 |
+
# IDEs
|
| 48 |
+
.vscode/
|
| 49 |
+
.idea/
|
| 50 |
+
*.swp
|
| 51 |
+
*.swo
|
| 52 |
+
*~
|
| 53 |
+
|
| 54 |
+
# OS
|
| 55 |
+
.DS_Store
|
| 56 |
+
Thumbs.db
|
| 57 |
+
|
| 58 |
+
# Claude
|
| 59 |
+
.claude/
|
| 60 |
+
.mcp.json
|
| 61 |
+
|
| 62 |
+
# Documentation (source files included, but skip extras)
|
| 63 |
+
docs/
|
| 64 |
+
*.md
|
| 65 |
+
!README.md
|
| 66 |
+
|
| 67 |
+
# Project specific
|
| 68 |
+
*.log
|
| 69 |
+
.env
|
| 70 |
+
.venv/
|
| 71 |
+
|
| 72 |
+
# CI/CD
|
| 73 |
+
CODE_OF_CONDUCT.md
|
| 74 |
+
CONTRIBUTING.md
|
| 75 |
+
MARKETING.md
|
| 76 |
+
SECURITY.md
|
| 77 |
+
CHANGELOG.md
|
| 78 |
+
|
| 79 |
+
# Screenshots and images
|
| 80 |
+
*.png
|
| 81 |
+
*.jpg
|
| 82 |
+
*.jpeg
|
| 83 |
+
*.gif
|
| 84 |
+
!screenshot.png
|
| 85 |
+
|
| 86 |
+
# Test files
|
| 87 |
+
tests/
|
| 88 |
+
examples/
|
| 89 |
+
configs/
|
| 90 |
+
|
| 91 |
+
# MCP server config
|
| 92 |
+
.mcp.json
|
Dockerfile
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dockerfile for Hugging Face Spaces
|
| 2 |
+
# GPU Memory Calculator - FastAPI Web Application
|
| 3 |
+
|
| 4 |
+
FROM python:3.12-slim
|
| 5 |
+
|
| 6 |
+
# Set working directory
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
# Set environment variables
|
| 10 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 11 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
| 12 |
+
PORT=7860
|
| 13 |
+
|
| 14 |
+
# Install system dependencies
|
| 15 |
+
RUN apt-get update && \
|
| 16 |
+
apt-get install -y --no-install-recommends \
|
| 17 |
+
gcc \
|
| 18 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 19 |
+
|
| 20 |
+
# Copy requirements first for better Docker layer caching
|
| 21 |
+
COPY requirements.txt .
|
| 22 |
+
|
| 23 |
+
# Install Python dependencies
|
| 24 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 25 |
+
|
| 26 |
+
# Copy project files
|
| 27 |
+
COPY . .
|
| 28 |
+
|
| 29 |
+
# Install the package in editable mode
|
| 30 |
+
RUN pip install --no-cache-dir -e .
|
| 31 |
+
|
| 32 |
+
# Expose Hugging Face Spaces default port
|
| 33 |
+
EXPOSE 7860
|
| 34 |
+
|
| 35 |
+
# Health check endpoint
|
| 36 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
| 37 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/').read()"
|
| 38 |
+
|
| 39 |
+
# Run the FastAPI application with uvicorn
|
| 40 |
+
CMD ["uvicorn", "web.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,12 +1,62 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
license:
|
| 9 |
-
short_description: Calculates GPU memory for training, inference, and more
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: GPU Memory Calculator
|
| 3 |
+
emoji: 🎮
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# GPU Memory Calculator
|
| 12 |
+
|
| 13 |
+
Calculate GPU memory requirements for training and running Large Language Models (LLMs). Supports multiple training engines (PyTorch DDP, DeepSpeed ZeRO, Megatron-LM, FSDP), inference engines (HuggingFace, vLLM, TGI, TensorRT-LLM, SGLang), and multi-node training configurations.
|
| 14 |
+
|
| 15 |
+
## Features
|
| 16 |
+
|
| 17 |
+
- **Training Memory Calculation**: Calculate memory for PyTorch DDP, DeepSpeed ZeRO (0-3), Megatron-LM, FSDP, and hybrid approaches
|
| 18 |
+
- **Inference Memory Calculation**: Estimate memory requirements for HuggingFace Transformers, vLLM, TGI, TensorRT-LLM, and SGLang
|
| 19 |
+
- **Multi-Node Support**: Calculate network overhead for distributed training across multiple nodes
|
| 20 |
+
- **Model Presets**: Pre-configured settings for popular models (LLaMA 2, GPT-3, Mixtral, GLM, Qwen, DeepSeek-MoE)
|
| 21 |
+
- **Configuration Export**: Generate configs for Accelerate, Lightning, Axolotl, DeepSpeed, YAML, and JSON
|
| 22 |
+
- **Batch Size Optimization**: Automatically find the maximum batch size that fits in GPU memory
|
| 23 |
+
|
| 24 |
+
## Supported Training Engines
|
| 25 |
+
|
| 26 |
+
- PyTorch DDP (Distributed Data Parallel)
|
| 27 |
+
- DeepSpeed ZeRO (Stages 0-3) with CPU/NVMe offloading
|
| 28 |
+
- Megatron-LM (Tensor + Pipeline Parallelism)
|
| 29 |
+
- PyTorch FSDP (Fully Sharded Data Parallel)
|
| 30 |
+
- Megatron-LM + DeepSpeed (Hybrid)
|
| 31 |
+
|
| 32 |
+
## Supported Inference Engines
|
| 33 |
+
|
| 34 |
+
- HuggingFace Transformers
|
| 35 |
+
- vLLM (PagedAttention)
|
| 36 |
+
- Text Generation Inference (TGI)
|
| 37 |
+
- TensorRT-LLM
|
| 38 |
+
- SGLang (RadixAttention)
|
| 39 |
+
|
| 40 |
+
## How to Use
|
| 41 |
+
|
| 42 |
+
1. **Select a preset model** or configure your own
|
| 43 |
+
2. **Choose training/inference engine** and adjust parameters
|
| 44 |
+
3. **Calculate** memory requirements instantly
|
| 45 |
+
4. **Export** configurations to your preferred framework
|
| 46 |
+
|
| 47 |
+
## Example Use Cases
|
| 48 |
+
|
| 49 |
+
- Planning GPU requirements for LLM training
|
| 50 |
+
- Optimizing batch sizes for your hardware
|
| 51 |
+
- Comparing memory efficiency across engines
|
| 52 |
+
- Estimating KV cache memory for inference
|
| 53 |
+
- Calculating multi-node network overhead
|
| 54 |
+
|
| 55 |
+
## Links
|
| 56 |
+
|
| 57 |
+
- [GitHub Repository](https://github.com/George614/gpu-mem-calculator)
|
| 58 |
+
- [Documentation](https://github.com/George614/gpu-mem-calculator/blob/main/README.md)
|
| 59 |
+
|
| 60 |
+
## License
|
| 61 |
+
|
| 62 |
+
MIT License - see [LICENSE](https://github.com/George614/gpu-mem-calculator/blob/main/LICENSE) for details.
|
cli/main.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI interface for GPU Memory Calculator."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import TYPE_CHECKING, Literal
|
| 7 |
+
|
| 8 |
+
import click
|
| 9 |
+
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
|
| 12 |
+
from gpu_mem_calculator.core.models import MemoryResult
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@click.group()
|
| 16 |
+
@click.version_option(version="0.1.0")
|
| 17 |
+
def main() -> None:
|
| 18 |
+
"""GPU Memory Calculator for LLM Training.
|
| 19 |
+
|
| 20 |
+
Calculate GPU memory requirements for training Large Language Models
|
| 21 |
+
with various training engines (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP).
|
| 22 |
+
"""
|
| 23 |
+
pass
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@main.command()
|
| 27 |
+
@click.option(
|
| 28 |
+
"--config",
|
| 29 |
+
"-c",
|
| 30 |
+
type=click.Path(exists=True),
|
| 31 |
+
help="Path to JSON configuration file",
|
| 32 |
+
)
|
| 33 |
+
@click.option(
|
| 34 |
+
"--preset",
|
| 35 |
+
"-p",
|
| 36 |
+
type=str,
|
| 37 |
+
help="Name of a preset model configuration",
|
| 38 |
+
)
|
| 39 |
+
@click.option(
|
| 40 |
+
"--output",
|
| 41 |
+
"-o",
|
| 42 |
+
type=click.Path(),
|
| 43 |
+
help="Output file path (default: stdout)",
|
| 44 |
+
)
|
| 45 |
+
@click.option(
|
| 46 |
+
"--format",
|
| 47 |
+
"-f",
|
| 48 |
+
type=click.Choice(["json", "yaml", "table"]),
|
| 49 |
+
default="table",
|
| 50 |
+
help="Output format (default: table)",
|
| 51 |
+
)
|
| 52 |
+
def calculate(
|
| 53 |
+
config: str | None,
|
| 54 |
+
preset: str | None,
|
| 55 |
+
output: str | None,
|
| 56 |
+
format: Literal["json", "yaml", "table"],
|
| 57 |
+
) -> None:
|
| 58 |
+
"""Calculate GPU memory requirements from config file or preset.
|
| 59 |
+
|
| 60 |
+
Examples:
|
| 61 |
+
gpu-mem-calc calculate --config configs/llama2_7b.json
|
| 62 |
+
gpu-mem-calc calculate --preset llama2-7b
|
| 63 |
+
gpu-mem-calc calculate -p mixtral-8x7b --format json
|
| 64 |
+
"""
|
| 65 |
+
if not config and not preset:
|
| 66 |
+
click.echo("Error: Either --config or --preset is required", err=True)
|
| 67 |
+
sys.exit(1)
|
| 68 |
+
|
| 69 |
+
if config and preset:
|
| 70 |
+
click.echo("Error: Cannot use both --config and --preset", err=True)
|
| 71 |
+
sys.exit(1)
|
| 72 |
+
|
| 73 |
+
try:
|
| 74 |
+
import tempfile
|
| 75 |
+
|
| 76 |
+
from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
|
| 77 |
+
|
| 78 |
+
if preset:
|
| 79 |
+
# Load preset configuration
|
| 80 |
+
from gpu_mem_calculator.config.presets import get_preset_config
|
| 81 |
+
|
| 82 |
+
preset_config = get_preset_config(preset)
|
| 83 |
+
if preset_config is None:
|
| 84 |
+
click.echo(
|
| 85 |
+
f"Error: Preset '{preset}' not found. "
|
| 86 |
+
"Use 'gpu-mem-calc presets' to list available presets.",
|
| 87 |
+
err=True,
|
| 88 |
+
)
|
| 89 |
+
sys.exit(1)
|
| 90 |
+
|
| 91 |
+
# Write preset to temp file for from_config_file
|
| 92 |
+
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
|
| 93 |
+
json.dump(preset_config, f, indent=2)
|
| 94 |
+
temp_path = f.name
|
| 95 |
+
|
| 96 |
+
calculator = GPUMemoryCalculator.from_config_file(temp_path)
|
| 97 |
+
Path(temp_path).unlink() # Clean up temp file
|
| 98 |
+
elif config:
|
| 99 |
+
calculator = GPUMemoryCalculator.from_config_file(config)
|
| 100 |
+
else:
|
| 101 |
+
# This should never happen due to the checks above
|
| 102 |
+
click.echo("Error: Either --config or --preset is required", err=True)
|
| 103 |
+
sys.exit(1)
|
| 104 |
+
|
| 105 |
+
result = calculator.calculate()
|
| 106 |
+
|
| 107 |
+
# Format output
|
| 108 |
+
if format == "json":
|
| 109 |
+
output_text = json.dumps(result.model_dump(mode="json"), indent=2)
|
| 110 |
+
elif format == "yaml":
|
| 111 |
+
try:
|
| 112 |
+
import yaml # type: ignore[import-untyped]
|
| 113 |
+
|
| 114 |
+
output_text = yaml.dump(result.model_dump(mode="json"), default_flow_style=False)
|
| 115 |
+
except ImportError:
|
| 116 |
+
click.echo(
|
| 117 |
+
"Error: YAML format requires PyYAML. Install with: pip install pyyaml",
|
| 118 |
+
err=True,
|
| 119 |
+
)
|
| 120 |
+
sys.exit(1)
|
| 121 |
+
else: # table
|
| 122 |
+
output_text = _format_result_as_table(result, calculator)
|
| 123 |
+
|
| 124 |
+
# Write output
|
| 125 |
+
if output:
|
| 126 |
+
Path(output).write_text(output_text)
|
| 127 |
+
click.echo(f"Results written to {output}")
|
| 128 |
+
else:
|
| 129 |
+
click.echo(output_text)
|
| 130 |
+
|
| 131 |
+
except Exception as e:
|
| 132 |
+
click.echo(f"Error: {e}", err=True)
|
| 133 |
+
sys.exit(1)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
@main.command()
|
| 137 |
+
@click.argument(
|
| 138 |
+
"params",
|
| 139 |
+
type=float,
|
| 140 |
+
required=True,
|
| 141 |
+
)
|
| 142 |
+
@click.option(
|
| 143 |
+
"--gpus",
|
| 144 |
+
"-g",
|
| 145 |
+
type=int,
|
| 146 |
+
default=1,
|
| 147 |
+
help="Number of GPUs (default: 1)",
|
| 148 |
+
)
|
| 149 |
+
@click.option(
|
| 150 |
+
"--gpu-mem",
|
| 151 |
+
"-m",
|
| 152 |
+
type=float,
|
| 153 |
+
default=80.0,
|
| 154 |
+
help="GPU memory in GB (default: 80.0)",
|
| 155 |
+
)
|
| 156 |
+
@click.option(
|
| 157 |
+
"--engine",
|
| 158 |
+
"-e",
|
| 159 |
+
type=click.Choice(["pytorch", "deepspeed", "megatron", "fsdp"]),
|
| 160 |
+
default="pytorch",
|
| 161 |
+
help="Training engine (default: pytorch)",
|
| 162 |
+
)
|
| 163 |
+
@click.option(
|
| 164 |
+
"--dtype",
|
| 165 |
+
"-d",
|
| 166 |
+
type=click.Choice(["fp32", "fp16", "bf16"]),
|
| 167 |
+
default="bf16",
|
| 168 |
+
help="Data type (default: bf16)",
|
| 169 |
+
)
|
| 170 |
+
def quick(
|
| 171 |
+
params: float,
|
| 172 |
+
gpus: int,
|
| 173 |
+
gpu_mem: float,
|
| 174 |
+
engine: str,
|
| 175 |
+
dtype: str,
|
| 176 |
+
) -> None:
|
| 177 |
+
"""Quick calculation from model size (in billions of parameters).
|
| 178 |
+
|
| 179 |
+
Example:
|
| 180 |
+
gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
|
| 181 |
+
"""
|
| 182 |
+
try:
|
| 183 |
+
from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
|
| 184 |
+
from gpu_mem_calculator.core.models import (
|
| 185 |
+
DType,
|
| 186 |
+
EngineConfig,
|
| 187 |
+
EngineType,
|
| 188 |
+
GPUConfig,
|
| 189 |
+
ModelConfig,
|
| 190 |
+
ParallelismConfig,
|
| 191 |
+
TrainingConfig,
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
# Map engine string to EngineType
|
| 195 |
+
engine_map = {
|
| 196 |
+
"pytorch": EngineType.PYTORCH_DDP,
|
| 197 |
+
"deepspeed": EngineType.DEEPSPEED,
|
| 198 |
+
"megatron": EngineType.MEGATRON_LM,
|
| 199 |
+
"fsdp": EngineType.FSDP,
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
# Map dtype string to DType
|
| 203 |
+
dtype_map = {
|
| 204 |
+
"fp32": DType.FP32,
|
| 205 |
+
"fp16": DType.FP16,
|
| 206 |
+
"bf16": DType.BF16,
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
# Create a minimal config for quick calculation
|
| 210 |
+
# Estimate model architecture from parameter count
|
| 211 |
+
# Rough approximation based on typical transformer models
|
| 212 |
+
num_params = int(params * 1e9)
|
| 213 |
+
|
| 214 |
+
# Estimate hidden size and layers from param count
|
| 215 |
+
# These are rough approximations
|
| 216 |
+
if params <= 1:
|
| 217 |
+
hidden_size, num_layers = 768, 12
|
| 218 |
+
elif params <= 7:
|
| 219 |
+
hidden_size, num_layers = 4096, 32
|
| 220 |
+
elif params <= 13:
|
| 221 |
+
hidden_size, num_layers = 5120, 40
|
| 222 |
+
elif params <= 30:
|
| 223 |
+
hidden_size, num_layers = 6656, 60
|
| 224 |
+
elif params <= 65:
|
| 225 |
+
hidden_size, num_layers = 8192, 80
|
| 226 |
+
else:
|
| 227 |
+
hidden_size, num_layers = 12288, 96
|
| 228 |
+
|
| 229 |
+
model_config = ModelConfig(
|
| 230 |
+
name="quick-estimate",
|
| 231 |
+
num_parameters=num_params,
|
| 232 |
+
num_layers=num_layers,
|
| 233 |
+
hidden_size=hidden_size,
|
| 234 |
+
num_attention_heads=hidden_size // 128,
|
| 235 |
+
vocab_size=32000,
|
| 236 |
+
max_seq_len=2048,
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
training_config = TrainingConfig(
|
| 240 |
+
batch_size=1,
|
| 241 |
+
gradient_accumulation_steps=1,
|
| 242 |
+
dtype=dtype_map[dtype],
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
parallelism_config = ParallelismConfig(data_parallel_size=gpus)
|
| 246 |
+
|
| 247 |
+
engine_config = EngineConfig(
|
| 248 |
+
type=engine_map[engine],
|
| 249 |
+
zero_stage=2 if engine == "deepspeed" else None,
|
| 250 |
+
)
|
| 251 |
+
|
| 252 |
+
gpu_config = GPUConfig(num_gpus=gpus, gpu_memory_gb=gpu_mem)
|
| 253 |
+
|
| 254 |
+
calculator = GPUMemoryCalculator(
|
| 255 |
+
model_config=model_config,
|
| 256 |
+
training_config=training_config,
|
| 257 |
+
parallelism_config=parallelism_config,
|
| 258 |
+
engine_config=engine_config,
|
| 259 |
+
gpu_config=gpu_config,
|
| 260 |
+
)
|
| 261 |
+
|
| 262 |
+
result = calculator.calculate()
|
| 263 |
+
|
| 264 |
+
# Display results
|
| 265 |
+
click.echo(_format_result_as_table(result, calculator))
|
| 266 |
+
|
| 267 |
+
except Exception as e:
|
| 268 |
+
click.echo(f"Error: {e}", err=True)
|
| 269 |
+
sys.exit(1)
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
@main.command()
|
| 273 |
+
@click.argument(
|
| 274 |
+
"config_path",
|
| 275 |
+
type=click.Path(exists=True),
|
| 276 |
+
)
|
| 277 |
+
def validate(config_path: str) -> None:
|
| 278 |
+
"""Validate a configuration file.
|
| 279 |
+
|
| 280 |
+
Example:
|
| 281 |
+
gpu-mem-calc validate configs/my_config.json
|
| 282 |
+
"""
|
| 283 |
+
try:
|
| 284 |
+
from gpu_mem_calculator.config import ConfigParser
|
| 285 |
+
|
| 286 |
+
ConfigParser.parse_full_config(config_path)
|
| 287 |
+
click.echo(f"✓ Configuration file '{config_path}' is valid")
|
| 288 |
+
|
| 289 |
+
except Exception as e:
|
| 290 |
+
click.echo(f"✗ Validation failed: {e}", err=True)
|
| 291 |
+
sys.exit(1)
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
@main.command()
|
| 295 |
+
@click.option(
|
| 296 |
+
"--format",
|
| 297 |
+
"-f",
|
| 298 |
+
type=click.Choice(["list", "json", "table"]),
|
| 299 |
+
default="list",
|
| 300 |
+
help="Output format (default: list)",
|
| 301 |
+
)
|
| 302 |
+
def presets(format: str) -> None:
|
| 303 |
+
"""List available model preset configurations.
|
| 304 |
+
|
| 305 |
+
Examples:
|
| 306 |
+
gpu-mem-calc presets
|
| 307 |
+
gpu-mem-calc presets --format table
|
| 308 |
+
gpu-mem-calc presets -f json
|
| 309 |
+
"""
|
| 310 |
+
try:
|
| 311 |
+
from gpu_mem_calculator.config.presets import list_presets
|
| 312 |
+
|
| 313 |
+
all_presets = list_presets()
|
| 314 |
+
|
| 315 |
+
if not all_presets:
|
| 316 |
+
click.echo("No presets found.")
|
| 317 |
+
return
|
| 318 |
+
|
| 319 |
+
if format == "json":
|
| 320 |
+
click.echo(json.dumps(all_presets, indent=2))
|
| 321 |
+
elif format == "table":
|
| 322 |
+
from rich.console import Console
|
| 323 |
+
from rich.table import Table
|
| 324 |
+
|
| 325 |
+
console = Console()
|
| 326 |
+
table = Table(
|
| 327 |
+
title="Available Model Presets",
|
| 328 |
+
show_header=True,
|
| 329 |
+
header_style="bold magenta",
|
| 330 |
+
)
|
| 331 |
+
table.add_column("Preset Name", style="cyan", width=25)
|
| 332 |
+
table.add_column("Display Name", style="green", width=30)
|
| 333 |
+
table.add_column("Description", style="yellow")
|
| 334 |
+
|
| 335 |
+
for name, info in sorted(all_presets.items()):
|
| 336 |
+
table.add_row(name, info["display_name"], info["description"])
|
| 337 |
+
|
| 338 |
+
console.print(table)
|
| 339 |
+
else: # list format
|
| 340 |
+
click.echo("Available model presets:\n")
|
| 341 |
+
for name, info in sorted(all_presets.items()): # type: ignore[annotation-unchecked]
|
| 342 |
+
click.echo(f" {name:25} - {info['display_name']}")
|
| 343 |
+
if info.get("description"):
|
| 344 |
+
click.echo(f"{'':27}{info['description']}")
|
| 345 |
+
click.echo()
|
| 346 |
+
|
| 347 |
+
except Exception as e:
|
| 348 |
+
click.echo(f"Error: {e}", err=True)
|
| 349 |
+
sys.exit(1)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def _format_result_as_table(result: MemoryResult, calculator: "GPUMemoryCalculator") -> str:
|
| 353 |
+
"""Format result as ASCII table."""
|
| 354 |
+
from rich.console import Console
|
| 355 |
+
from rich.table import Table
|
| 356 |
+
|
| 357 |
+
console = Console()
|
| 358 |
+
|
| 359 |
+
# Main results table
|
| 360 |
+
table = Table(
|
| 361 |
+
title="GPU Memory Calculation Results",
|
| 362 |
+
show_header=True,
|
| 363 |
+
header_style="bold magenta",
|
| 364 |
+
)
|
| 365 |
+
table.add_column("Metric", style="cyan", width=30)
|
| 366 |
+
table.add_column("Value", style="green")
|
| 367 |
+
|
| 368 |
+
# Memory results
|
| 369 |
+
table.add_row("Memory per GPU", f"{result.total_memory_per_gpu_gb:.2f} GB")
|
| 370 |
+
table.add_row("Total GPU Memory", f"{result.total_memory_all_gpus_gb:.2f} GB")
|
| 371 |
+
table.add_row("CPU Memory", f"{result.cpu_memory_gb:.2f} GB")
|
| 372 |
+
table.add_row("", "") # Spacer
|
| 373 |
+
|
| 374 |
+
# Breakdown
|
| 375 |
+
table.add_row("Model Parameters", f"{result.breakdown.model_params_gb:.2f} GB")
|
| 376 |
+
table.add_row("Gradients", f"{result.breakdown.gradients_gb:.2f} GB")
|
| 377 |
+
table.add_row("Optimizer States", f"{result.breakdown.optimizer_states_gb:.2f} GB")
|
| 378 |
+
table.add_row("Activations", f"{result.breakdown.activations_gb:.2f} GB")
|
| 379 |
+
table.add_row("Overhead", f"{result.breakdown.overhead_gb:.2f} GB")
|
| 380 |
+
table.add_row("", "") # Spacer
|
| 381 |
+
|
| 382 |
+
# Feasibility
|
| 383 |
+
status = "✓ Fits" if result.fits_on_gpu else "✗ OOM"
|
| 384 |
+
table.add_row("Status", status)
|
| 385 |
+
table.add_row("Memory Utilization", f"{result.memory_utilization_percent:.1f}%")
|
| 386 |
+
if result.recommended_batch_size:
|
| 387 |
+
table.add_row("Recommended Batch Size", str(result.recommended_batch_size))
|
| 388 |
+
|
| 389 |
+
# Capture table output
|
| 390 |
+
from io import StringIO
|
| 391 |
+
|
| 392 |
+
buffer = StringIO()
|
| 393 |
+
console.file = buffer
|
| 394 |
+
console.print(table)
|
| 395 |
+
return buffer.getvalue()
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
if __name__ == "__main__":
|
| 399 |
+
main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GPU Memory Calculator - Requirements for Hugging Face Spaces
|
| 2 |
+
|
| 3 |
+
# Core dependencies
|
| 4 |
+
pydantic>=2.0.0
|
| 5 |
+
click>=8.1.0
|
| 6 |
+
pydantic-settings>=2.0.0
|
| 7 |
+
rich>=13.0.0
|
| 8 |
+
|
| 9 |
+
# Web dependencies
|
| 10 |
+
fastapi>=0.100.0
|
| 11 |
+
uvicorn[standard]>=0.23.0
|
| 12 |
+
jinja2>=3.1.0
|
src/gpu_mem_calculator.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,720 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: gpu-mem-calculator
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: GPU Memory Calculator for LLM Training
|
| 5 |
+
Author: GPU Mem Calculator Team
|
| 6 |
+
License: MIT
|
| 7 |
+
Project-URL: Homepage, https://github.com/George614/gpu-mem-calculator
|
| 8 |
+
Project-URL: Repository, https://github.com/George614/gpu-mem-calculator
|
| 9 |
+
Project-URL: Issues, https://github.com/George614/gpu-mem-calculator/issues
|
| 10 |
+
Keywords: gpu,memory,calculator,llm,large-language-model,training,deepspeed,megatron,pytorch,fsdp,transformer,machine-learning,deep-learning,distributed-training,zero-optimization
|
| 11 |
+
Classifier: Development Status :: 3 - Alpha
|
| 12 |
+
Classifier: Intended Audience :: Developers
|
| 13 |
+
Classifier: Intended Audience :: Science/Research
|
| 14 |
+
Classifier: License :: OSI Approved :: MIT License
|
| 15 |
+
Classifier: Programming Language :: Python :: 3
|
| 16 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 17 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 19 |
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
| 20 |
+
Requires-Python: >=3.10
|
| 21 |
+
Description-Content-Type: text/markdown
|
| 22 |
+
License-File: LICENSE
|
| 23 |
+
Requires-Dist: pydantic>=2.0.0
|
| 24 |
+
Requires-Dist: click>=8.1.0
|
| 25 |
+
Requires-Dist: pydantic-settings>=2.0.0
|
| 26 |
+
Requires-Dist: rich>=13.0.0
|
| 27 |
+
Provides-Extra: web
|
| 28 |
+
Requires-Dist: fastapi>=0.100.0; extra == "web"
|
| 29 |
+
Requires-Dist: uvicorn[standard]>=0.23.0; extra == "web"
|
| 30 |
+
Requires-Dist: jinja2>=3.1.0; extra == "web"
|
| 31 |
+
Provides-Extra: dev
|
| 32 |
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
| 33 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
| 34 |
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
| 35 |
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
| 36 |
+
Requires-Dist: mypy>=1.5.0; extra == "dev"
|
| 37 |
+
Dynamic: license-file
|
| 38 |
+
|
| 39 |
+
# GPU Memory Calculator for LLM Training
|
| 40 |
+
|
| 41 |
+
[](https://opensource.org/licenses/MIT)
|
| 42 |
+
[](https://www.python.org/downloads/)
|
| 43 |
+
[](https://github.com/psf/black)
|
| 44 |
+
[](CONTRIBUTING.md)
|
| 45 |
+
|
| 46 |
+
A versatile Python application for calculating GPU memory requirements for training Large Language Models with support for multiple training engines including PyTorch DDP, DeepSpeed ZeRO, Megatron-LM, and FSDP.
|
| 47 |
+
|
| 48 |
+
📖 **[Getting Started Guide](docs/GETTING_STARTED.md)** | 💬 **[FAQ](docs/FAQ.md)** | 🤝 **[Contributing](CONTRIBUTING.md)**
|
| 49 |
+
|
| 50 |
+
<p align="center">
|
| 51 |
+
<img src="screenshot.png" alt="GPU Memory Calculator Screenshot" width="800">
|
| 52 |
+
</p>
|
| 53 |
+
|
| 54 |
+
## 🚀 Why Use This Tool?
|
| 55 |
+
|
| 56 |
+
Training large language models requires careful memory planning. This calculator helps you:
|
| 57 |
+
|
| 58 |
+
- **💰 Save costs** by determining the optimal GPU configuration before you start training
|
| 59 |
+
- **⚡ Avoid OOM errors** by validating your training configuration fits in GPU memory
|
| 60 |
+
- **📊 Compare strategies** across different training engines (DeepSpeed, Megatron, FSDP)
|
| 61 |
+
- **🎯 Plan infrastructure** by knowing exactly how many GPUs you need
|
| 62 |
+
- **📈 Scale efficiently** with detailed memory breakdowns for optimization
|
| 63 |
+
|
| 64 |
+
Whether you're training a 7B parameter model on a single GPU or a 175B model across hundreds of GPUs, this tool provides accurate memory estimates based on proven formulas from DeepSpeed, Megatron-LM, and the latest research.
|
| 65 |
+
|
| 66 |
+
## ✨ Features
|
| 67 |
+
|
| 68 |
+
### Core Training Calculation
|
| 69 |
+
- 🔧 **Multiple Training Engines**: Support for PyTorch DDP, DeepSpeed ZeRO (stages 0-3), Megatron-LM, Megatron+DeepSpeed, and PyTorch FSDP
|
| 70 |
+
- 🖥️ **Dual Interface**: Both CLI and Web UI for flexible usage
|
| 71 |
+
- 🎯 **Preset Models**: Quick-load configurations for popular models (LLaMA 2, GPT-3, etc.)
|
| 72 |
+
- 📊 **Detailed Breakdown**: Memory breakdown by component (parameters, gradients, optimizer states, activations)
|
| 73 |
+
- ✅ **Feasibility Analysis**: Check if your configuration fits on available GPU memory
|
| 74 |
+
- ⚙️ **Easy Config**: JSON-based configuration files with human-readable parameter formats (e.g., "7B", "7000M")
|
| 75 |
+
|
| 76 |
+
### 🆕 Inference Memory Calculation
|
| 77 |
+
- 🚀 **Multi-Engine Support**: HuggingFace Transformers, vLLM, TGI, TensorRT-LLM
|
| 78 |
+
- 💾 **KV Cache Optimization**: Quantization options (NONE, INT8, FP8, INT4)
|
| 79 |
+
- 🔄 **Tensor Parallelism**: Automatic memory distribution across GPUs
|
| 80 |
+
- 📈 **Throughput Estimation**: Tokens/second estimates for capacity planning
|
| 81 |
+
- 🎯 **Batch Size Optimization**: Find maximum batch size for your hardware
|
| 82 |
+
|
| 83 |
+
### 🆕 Multi-Node Training
|
| 84 |
+
- 🌐 **Network Overhead Calculation**: AllReduce, AllGather, ReduceScatter, pipeline communication
|
| 85 |
+
- 📡 **Interconnect Support**: InfiniBand, NVLink, Ethernet (10G/25G/100G/200G)
|
| 86 |
+
- ⚡ **Hybrid Parallelism Optimization**: Automatic TP+PP+DP strategy optimization
|
| 87 |
+
- 🔧 **ZeRO Stage Impact Analysis**: Compare communication overhead across ZeRO stages
|
| 88 |
+
|
| 89 |
+
### 🆕 Framework Configuration Exporters
|
| 90 |
+
- 📦 **Accelerate Export**: HuggingFace Accelerate config generation
|
| 91 |
+
- ⚡ **Lightning Export**: PyTorch Lightning Trainer configuration
|
| 92 |
+
- 🔥 **Axolotl Export**: YAML config for fine-tuning
|
| 93 |
+
- 📄 **File Export**: Save to YAML/JSON formats
|
| 94 |
+
- 🎛️ **Format Conversion**: Convert between different framework configs
|
| 95 |
+
|
| 96 |
+
## 📦 Installation
|
| 97 |
+
|
| 98 |
+
### Quick Start
|
| 99 |
+
|
| 100 |
+
### Core Capabilities
|
| 101 |
+
- **Multiple Training Engines**: Support for PyTorch DDP, DeepSpeed ZeRO (stages 0-3), Megatron-LM, Megatron+DeepSpeed, and PyTorch FSDP
|
| 102 |
+
- **Dual Interface**: Both CLI and Web UI for flexible usage
|
| 103 |
+
- **Preset Models**: Quick-load configurations for popular models (LLaMA 2, GPT-3, GLM, Mixtral, etc.)
|
| 104 |
+
- **Detailed Breakdown**: Memory breakdown by component (parameters, gradients, optimizer states, activations)
|
| 105 |
+
- **Feasibility Analysis**: Check if your configuration fits on available GPU memory
|
| 106 |
+
- **Easy Config**: JSON-based configuration files with human-readable parameter formats (e.g., "7B", "7000M")
|
| 107 |
+
|
| 108 |
+
### Web UI Enhancements
|
| 109 |
+
- **Formula Explanations**: See exactly how memory is calculated with your values plugged in
|
| 110 |
+
- **Real-time Validation**: Client-side validation prevents invalid configurations
|
| 111 |
+
- **Smart Auto-calculation**: Optimized debouncing (1s) with minimum interval protection
|
| 112 |
+
- **Export Capabilities**: Export to DeepSpeed config files, JSON, or copy to clipboard
|
| 113 |
+
- **Batch Size Optimizer**: Automatically find maximum batch size that fits
|
| 114 |
+
- **Comparison Mode**: Save and compare different configurations side-by-side
|
| 115 |
+
- **Accessibility Features**: ARIA labels, keyboard navigation, colorblind-friendly charts
|
| 116 |
+
|
| 117 |
+
### Advanced Features
|
| 118 |
+
- **MoE Support**: Mixture of Experts models with configurable experts and top-k routing
|
| 119 |
+
- **CPU/NVMe Offloading**: Offload optimizer states and parameters to CPU or NVMe storage
|
| 120 |
+
- **Activation Checkpointing**: 5 levels from none to full checkpointing
|
| 121 |
+
- **Sequence Parallelism**: Optimize memory for long sequences
|
| 122 |
+
- **Result Caching**: Fast repeated calculations with built-in caching
|
| 123 |
+
|
| 124 |
+
```bash
|
| 125 |
+
pip install git+https://github.com/George614/gpu-mem-calculator.git
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
### From source
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
git clone https://github.com/George614/gpu-mem-calculator.git
|
| 132 |
+
cd gpu-mem-calculator
|
| 133 |
+
pip install -e .
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
### For Web UI support
|
| 137 |
+
|
| 138 |
+
```bash
|
| 139 |
+
pip install -e ".[web]"
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
### Development installation
|
| 143 |
+
|
| 144 |
+
```bash
|
| 145 |
+
pip install -e ".[dev]"
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
## 🎓 Use Cases
|
| 149 |
+
|
| 150 |
+
### Research & Academia
|
| 151 |
+
- Estimate GPU requirements for research projects before requesting compute resources
|
| 152 |
+
- Plan multi-GPU training configurations for large-scale experiments
|
| 153 |
+
- Compare memory efficiency of different training strategies
|
| 154 |
+
|
| 155 |
+
### Industry & Production
|
| 156 |
+
- Cost optimization: Choose the right GPU type and count for your training workload
|
| 157 |
+
- Capacity planning: Forecast infrastructure needs for model development
|
| 158 |
+
- Debugging: Diagnose OOM errors and optimize memory usage
|
| 159 |
+
|
| 160 |
+
### Education & Learning
|
| 161 |
+
- Understand how training configuration affects memory consumption
|
| 162 |
+
- Learn about different distributed training strategies
|
| 163 |
+
- Experiment with various optimization techniques safely
|
| 164 |
+
|
| 165 |
+
## 🚀 Usage
|
| 166 |
+
|
| 167 |
+
### Command Line Interface
|
| 168 |
+
|
| 169 |
+
#### Using model presets (Recommended)
|
| 170 |
+
|
| 171 |
+
The calculator includes pre-configured model presets for popular LLMs:
|
| 172 |
+
|
| 173 |
+
```bash
|
| 174 |
+
# List all available presets
|
| 175 |
+
gpu-mem-calc presets
|
| 176 |
+
|
| 177 |
+
# Calculate with a preset
|
| 178 |
+
gpu-mem-calc calculate --preset llama2-7b
|
| 179 |
+
gpu-mem-calc calculate --preset mixtral-8x7b --format json
|
| 180 |
+
|
| 181 |
+
# List presets in table format
|
| 182 |
+
gpu-mem-calc presets --format table
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
Available presets include:
|
| 186 |
+
- **Dense Models**: LLaMA 2 (7B, 13B, 70B), GPT-3 (175B)
|
| 187 |
+
- **MoE Models**: Mixtral 8x7B, GLM-4 (9B), GLM-4.7 (355B), GLM-4.5 Air (106B),
|
| 188 |
+
Qwen1.5-MoE-A2.7B, DeepSeek-MoE (16B)
|
| 189 |
+
|
| 190 |
+
#### Calculate from config file
|
| 191 |
+
|
| 192 |
+
```bash
|
| 193 |
+
gpu-mem-calc calculate --config configs/llama2_7b_deepspeed.json
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
#### Quick calculation from model size
|
| 197 |
+
|
| 198 |
+
```bash
|
| 199 |
+
# Calculate memory for 7B model with 8x80GB GPUs using DeepSpeed
|
| 200 |
+
gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
|
| 201 |
+
|
| 202 |
+
# With custom GPU memory
|
| 203 |
+
gpu-mem-calc quick 70 --gpus 64 --gpu-mem 80 --engine megatron
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
#### Validate configuration
|
| 207 |
+
|
| 208 |
+
```bash
|
| 209 |
+
gpu-mem-calc validate configs/my_config.json
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
### Web Interface
|
| 213 |
+
|
| 214 |
+
Start the web server:
|
| 215 |
+
|
| 216 |
+
```bash
|
| 217 |
+
python -m gpu_mem_calculator.web.app
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
Or using uvicorn directly:
|
| 221 |
+
|
| 222 |
+
```bash
|
| 223 |
+
uvicorn gpu_mem_calculator.web.app:app --reload
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
+
Then open your browser to `http://localhost:8000`
|
| 227 |
+
|
| 228 |
+
### Python API
|
| 229 |
+
|
| 230 |
+
#### Training Memory Calculation
|
| 231 |
+
|
| 232 |
+
```python
|
| 233 |
+
from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
|
| 234 |
+
from gpu_mem_calculator.core.models import (
|
| 235 |
+
ModelConfig,
|
| 236 |
+
TrainingConfig,
|
| 237 |
+
ParallelismConfig,
|
| 238 |
+
EngineConfig,
|
| 239 |
+
GPUConfig,
|
| 240 |
+
)
|
| 241 |
+
|
| 242 |
+
# Create configuration
|
| 243 |
+
model_config = ModelConfig(
|
| 244 |
+
name="llama2-7b",
|
| 245 |
+
num_parameters=7_000_000_000,
|
| 246 |
+
num_layers=32,
|
| 247 |
+
hidden_size=4096,
|
| 248 |
+
num_attention_heads=32,
|
| 249 |
+
vocab_size=32000,
|
| 250 |
+
max_seq_len=4096,
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
training_config = TrainingConfig(
|
| 254 |
+
batch_size=4,
|
| 255 |
+
gradient_accumulation_steps=4,
|
| 256 |
+
dtype="bf16",
|
| 257 |
+
optimizer="adamw",
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
parallelism_config = ParallelismConfig(
|
| 261 |
+
data_parallel_size=8,
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
engine_config = EngineConfig(
|
| 265 |
+
type="deepspeed",
|
| 266 |
+
zero_stage=3,
|
| 267 |
+
offload_optimizer="cpu",
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
gpu_config = GPUConfig(
|
| 271 |
+
num_gpus=8,
|
| 272 |
+
gpu_memory_gb=80,
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
# Calculate memory
|
| 276 |
+
calculator = GPUMemoryCalculator(
|
| 277 |
+
model_config=model_config,
|
| 278 |
+
training_config=training_config,
|
| 279 |
+
parallelism_config=parallelism_config,
|
| 280 |
+
engine_config=engine_config,
|
| 281 |
+
gpu_config=gpu_config,
|
| 282 |
+
)
|
| 283 |
+
|
| 284 |
+
result = calculator.calculate()
|
| 285 |
+
|
| 286 |
+
print(f"Memory per GPU: {result.total_memory_per_gpu_gb:.2f} GB")
|
| 287 |
+
print(f"Fits on GPU: {result.fits_on_gpu}")
|
| 288 |
+
print(f"Utilization: {result.memory_utilization_percent:.1f}%")
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
#### 🆕 Inference Memory Calculation
|
| 292 |
+
|
| 293 |
+
```python
|
| 294 |
+
from gpu_mem_calculator.inference.calculator import InferenceMemoryCalculator
|
| 295 |
+
from gpu_mem_calculator.core.models import (
|
| 296 |
+
ModelConfig,
|
| 297 |
+
InferenceConfig,
|
| 298 |
+
InferenceEngineType,
|
| 299 |
+
GPUConfig,
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
# Create configurations
|
| 303 |
+
model_config = ModelConfig(
|
| 304 |
+
name="llama2-7b",
|
| 305 |
+
num_parameters=7_000_000_000,
|
| 306 |
+
num_layers=32,
|
| 307 |
+
hidden_size=4096,
|
| 308 |
+
num_attention_heads=32,
|
| 309 |
+
max_seq_len=4096,
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
inference_config = InferenceConfig(
|
| 313 |
+
batch_size=32,
|
| 314 |
+
kv_cache_quantization="int8", # NONE, INT8, FP8, INT4
|
| 315 |
+
tensor_parallel_size=2,
|
| 316 |
+
gpu_memory_utilization=0.9,
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
gpu_config = GPUConfig(num_gpus=2, gpu_memory_gb=80)
|
| 320 |
+
|
| 321 |
+
# Calculate for different inference engines
|
| 322 |
+
calculator = InferenceMemoryCalculator(model_config, inference_config, gpu_config)
|
| 323 |
+
|
| 324 |
+
# vLLM inference
|
| 325 |
+
result_vllm = calculator.calculate(InferenceEngineType.VLLM)
|
| 326 |
+
print(f"vLLM: {result_vllm.total_memory_per_gpu_gb:.2f} GB")
|
| 327 |
+
print(f"Max batch size: {result_vllm.max_supported_batch_size}")
|
| 328 |
+
print(f"Throughput: {result_vllm.estimated_throughput_tokens_per_sec:.0f} tokens/sec")
|
| 329 |
+
|
| 330 |
+
# TensorRT-LLM inference
|
| 331 |
+
result_trt = calculator.calculate(InferenceEngineType.TENSORRT_LLM)
|
| 332 |
+
print(f"TensorRT-LLM: {result_trt.total_memory_per_gpu_gb:.2f} GB")
|
| 333 |
+
```
|
| 334 |
+
|
| 335 |
+
#### 🆕 Multi-Node Network Overhead
|
| 336 |
+
|
| 337 |
+
```python
|
| 338 |
+
from gpu_mem_calculator.core.multinode import MultiNodeCalculator
|
| 339 |
+
from gpu_mem_calculator.core.models import (
|
| 340 |
+
NodeConfig,
|
| 341 |
+
InterconnectType,
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
# Configure multi-node setup
|
| 345 |
+
node_config = NodeConfig(
|
| 346 |
+
num_nodes=4,
|
| 347 |
+
gpus_per_node=8,
|
| 348 |
+
interconnect_type=InterconnectType.INFINIBAND,
|
| 349 |
+
)
|
| 350 |
+
|
| 351 |
+
calculator = MultiNodeCalculator(
|
| 352 |
+
model_config=model_config,
|
| 353 |
+
training_config=training_config,
|
| 354 |
+
parallelism_config=parallelism_config,
|
| 355 |
+
node_config=node_config,
|
| 356 |
+
engine_config=engine_config,
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
+
# Calculate network overhead
|
| 360 |
+
network_overhead = calculator.calculate_network_overhead()
|
| 361 |
+
print(f"AllReduce: {network_overhead.allreduce_gb:.2f} GB")
|
| 362 |
+
print(f"AllGather: {network_overhead.allgather_gb:.2f} GB")
|
| 363 |
+
print(f"Time overhead: {network_overhead.estimated_overhead_ms_per_step:.2f} ms/step")
|
| 364 |
+
|
| 365 |
+
# Optimize hybrid parallelism
|
| 366 |
+
from gpu_mem_calculator.core.models import HybridParallelismConfig
|
| 367 |
+
|
| 368 |
+
hybrid_config = HybridParallelismConfig(
|
| 369 |
+
auto_optimize=True,
|
| 370 |
+
prefer_pipeline_parallel=True,
|
| 371 |
+
enable_sequence_parallel=True,
|
| 372 |
+
)
|
| 373 |
+
|
| 374 |
+
optimized_parallelism = calculator.optimize_hybrid_parallelism(hybrid_config)
|
| 375 |
+
print(f"Optimized TP: {optimized_parallelism.tensor_parallel_size}")
|
| 376 |
+
print(f"Optimized PP: {optimized_parallelism.pipeline_parallel_size}")
|
| 377 |
+
print(f"Optimized DP: {optimized_parallelism.data_parallel_size}")
|
| 378 |
+
```
|
| 379 |
+
|
| 380 |
+
#### 🆕 Export Framework Configurations
|
| 381 |
+
|
| 382 |
+
```python
|
| 383 |
+
from gpu_mem_calculator.exporters.manager import ExportManager, ExportFormat
|
| 384 |
+
|
| 385 |
+
# Create export manager
|
| 386 |
+
manager = ExportManager(
|
| 387 |
+
model_config=model_config,
|
| 388 |
+
training_config=training_config,
|
| 389 |
+
parallelism_config=parallelism_config,
|
| 390 |
+
engine_config=engine_config,
|
| 391 |
+
node_config=node_config,
|
| 392 |
+
)
|
| 393 |
+
|
| 394 |
+
# Export to different formats
|
| 395 |
+
accelerate_config = manager.export(ExportFormat.ACCELERATE)
|
| 396 |
+
lightning_config = manager.export(ExportFormat.LIGHTNING)
|
| 397 |
+
axolotl_config = manager.export(ExportFormat.AXOLOTL)
|
| 398 |
+
|
| 399 |
+
# Export to file
|
| 400 |
+
manager.export_to_file(ExportFormat.ACCELERATE, "accelerate_config.yaml")
|
| 401 |
+
manager.export_to_file(ExportFormat.JSON, "config.json")
|
| 402 |
+
|
| 403 |
+
# Get DeepSpeed config
|
| 404 |
+
deepspeed_config = manager.export(ExportFormat.DEEPSPEED)
|
| 405 |
+
```
|
| 406 |
+
|
| 407 |
+
## Configuration File Format
|
| 408 |
+
|
| 409 |
+
```json
|
| 410 |
+
{
|
| 411 |
+
"model": {
|
| 412 |
+
"name": "llama2-7b",
|
| 413 |
+
"num_parameters": "7B",
|
| 414 |
+
"num_layers": 32,
|
| 415 |
+
"hidden_size": 4096,
|
| 416 |
+
"num_attention_heads": 32,
|
| 417 |
+
"vocab_size": 32000,
|
| 418 |
+
"max_seq_len": 4096
|
| 419 |
+
},
|
| 420 |
+
"training": {
|
| 421 |
+
"batch_size": 4,
|
| 422 |
+
"gradient_accumulation_steps": 4,
|
| 423 |
+
"optimizer": "adamw",
|
| 424 |
+
"dtype": "bf16",
|
| 425 |
+
"activation_checkpointing": 1
|
| 426 |
+
},
|
| 427 |
+
"parallelism": {
|
| 428 |
+
"tensor_parallel_size": 1,
|
| 429 |
+
"pipeline_parallel_size": 1,
|
| 430 |
+
"data_parallel_size": 8,
|
| 431 |
+
"sequence_parallel": false
|
| 432 |
+
},
|
| 433 |
+
"engine": {
|
| 434 |
+
"type": "deepspeed",
|
| 435 |
+
"zero_stage": 3,
|
| 436 |
+
"offload_optimizer": "cpu",
|
| 437 |
+
"offload_param": "none"
|
| 438 |
+
},
|
| 439 |
+
"hardware": {
|
| 440 |
+
"num_gpus": 8,
|
| 441 |
+
"gpu_memory_gb": 80
|
| 442 |
+
}
|
| 443 |
+
}
|
| 444 |
+
```
|
| 445 |
+
|
| 446 |
+
## Supported Training Engines
|
| 447 |
+
|
| 448 |
+
### PyTorch DDP (Baseline)
|
| 449 |
+
Standard Distributed Data Parallel training without memory optimizations.
|
| 450 |
+
|
| 451 |
+
### DeepSpeed ZeRO
|
| 452 |
+
- **ZeRO-1**: Shard optimizer states
|
| 453 |
+
- **ZeRO-2**: Shard optimizer states + gradients
|
| 454 |
+
- **ZeRO-3**: Shard everything (parameters, gradients, optimizer states)
|
| 455 |
+
- Supports CPU/NVMe offloading
|
| 456 |
+
|
| 457 |
+
### Megatron-LM
|
| 458 |
+
Tensor and pipeline parallelism with activation checkpointing support.
|
| 459 |
+
|
| 460 |
+
### Megatron + DeepSpeed
|
| 461 |
+
Combines Megatron-LM's model parallelism with DeepSpeed ZeRO's optimizer sharding.
|
| 462 |
+
|
| 463 |
+
### PyTorch FSDP
|
| 464 |
+
Fully Sharded Data Parallel with multiple sharding strategies.
|
| 465 |
+
|
| 466 |
+
## Memory Formulas
|
| 467 |
+
|
| 468 |
+
The calculator uses formulas verified against authoritative sources:
|
| 469 |
+
|
| 470 |
+
### Base Components
|
| 471 |
+
|
| 472 |
+
**Model Parameters:**
|
| 473 |
+
- FP16/BF16: `num_params × 2 bytes`
|
| 474 |
+
- FP32: `num_params × 4 bytes`
|
| 475 |
+
|
| 476 |
+
**Gradients:**
|
| 477 |
+
- FP16/BF16: `num_params × 2 bytes`
|
| 478 |
+
- FP32: `num_params × 4 bytes`
|
| 479 |
+
|
| 480 |
+
**Optimizer States** (per optimizer type):
|
| 481 |
+
- **Adam/AdamW**: `num_params × 12 bytes`
|
| 482 |
+
- 4 bytes: FP32 parameter copy
|
| 483 |
+
- 4 bytes: Momentum
|
| 484 |
+
- 4 bytes: Variance
|
| 485 |
+
- **AdamW 8-bit**: `num_params × 2 bytes` (quantized)
|
| 486 |
+
- **SGD**: `num_params × 4 bytes` (FP32 only, no momentum)
|
| 487 |
+
|
| 488 |
+
**Activations:**
|
| 489 |
+
- Approximation: `batch_size × seq_len × hidden_size × num_layers × ~16 bytes/token/layer`
|
| 490 |
+
- Varies based on activation checkpointing level
|
| 491 |
+
|
| 492 |
+
### DeepSpeed ZeRO Stages
|
| 493 |
+
|
| 494 |
+
**ZeRO-0** (Baseline - same as PyTorch DDP):
|
| 495 |
+
```
|
| 496 |
+
total_per_gpu = 2×params + 2×params + 12×params + activations
|
| 497 |
+
= 16×params + activations
|
| 498 |
+
```
|
| 499 |
+
|
| 500 |
+
**ZeRO-1** (Shard optimizer states):
|
| 501 |
+
```
|
| 502 |
+
total_per_gpu = 2×params + 2×params + (12×params)/num_gpus + activations
|
| 503 |
+
```
|
| 504 |
+
|
| 505 |
+
**ZeRO-2** (Shard optimizer + gradients):
|
| 506 |
+
```
|
| 507 |
+
total_per_gpu = 2×params + (2×params)/num_gpus + (12×params)/num_gpus + activations
|
| 508 |
+
```
|
| 509 |
+
|
| 510 |
+
**ZeRO-3** (Shard everything):
|
| 511 |
+
```
|
| 512 |
+
total_per_gpu = largest_layer_memory + (16×params)/num_gpus + activations
|
| 513 |
+
where largest_layer_memory ≈ 4×(num_params/10)
|
| 514 |
+
```
|
| 515 |
+
|
| 516 |
+
**CPU/NVMe Offloading:**
|
| 517 |
+
- Optimizer states offloaded to CPU: 0 GB GPU memory
|
| 518 |
+
- Parameters offloaded to CPU/NVMe: Dynamically gathered during compute
|
| 519 |
+
|
| 520 |
+
### Verification
|
| 521 |
+
|
| 522 |
+
All formulas have been verified against:
|
| 523 |
+
- ✅ 18 comprehensive test scenarios (100% pass rate)
|
| 524 |
+
- ✅ EleutherAI Transformer Math 101
|
| 525 |
+
- ✅ Microsoft Research ZeRO Blog
|
| 526 |
+
- ✅ DeepSpeed Official Documentation
|
| 527 |
+
- ✅ PyTorch FSDP Documentation
|
| 528 |
+
|
| 529 |
+
### References
|
| 530 |
+
|
| 531 |
+
- [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) - Comprehensive transformer memory breakdown
|
| 532 |
+
- [Microsoft Research ZeRO Blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) - ZeRO optimization techniques
|
| 533 |
+
- [DeepSpeed Memory Documentation](https://deepspeed.readthedocs.io/en/latest/memory.html) - Official DeepSpeed memory formulas
|
| 534 |
+
|
| 535 |
+
## Example Configurations
|
| 536 |
+
|
| 537 |
+
### LLaMA 2 7B with DeepSpeed ZeRO-3
|
| 538 |
+
```bash
|
| 539 |
+
gpu-mem-calc calculate --config configs/llama2_7b_deepspeed.json
|
| 540 |
+
```
|
| 541 |
+
|
| 542 |
+
### GPT-3 175B with Megatron-LM
|
| 543 |
+
```bash
|
| 544 |
+
gpu-mem-calc calculate --config configs/gpt3_175b_megatron.json
|
| 545 |
+
```
|
| 546 |
+
|
| 547 |
+
### Custom 1B model with PyTorch DDP
|
| 548 |
+
```bash
|
| 549 |
+
gpu-mem-calc calculate --config configs/pytorch_ddp_example.json
|
| 550 |
+
```
|
| 551 |
+
|
| 552 |
+
## Web UI Features
|
| 553 |
+
|
| 554 |
+
### Interactive Interface
|
| 555 |
+
- **Real-time Calculations**: Auto-calculates as you adjust parameters (1s debounce)
|
| 556 |
+
- **Client-side Validation**: Instant feedback on configuration errors before API calls
|
| 557 |
+
- **Smart Presets**: Quick-load model configurations (LLaMA 2, GPT-3, GLM, Mixtral, Qwen, DeepSeek)
|
| 558 |
+
- **Visual Breakdown**: Color-coded bar chart with patterns for colorblind accessibility
|
| 559 |
+
- **Feasibility Status**: Clear indicators showing if configuration fits on GPU
|
| 560 |
+
|
| 561 |
+
### Formula Explanations
|
| 562 |
+
- **Detailed Breakdowns**: See exact formulas used with your values plugged in
|
| 563 |
+
- **Component-by-Component**: Each memory component explained with formula and result
|
| 564 |
+
- **Authoritative References**: Links to EleutherAI, Microsoft Research, DeepSpeed docs
|
| 565 |
+
- **Engine-Specific Details**: Different formulas for PyTorch DDP, DeepSpeed ZeRO, FSDP, Megatron-LM
|
| 566 |
+
|
| 567 |
+
### Advanced Tools
|
| 568 |
+
- **Export to DeepSpeed**: Generate `deepspeed_config.json` files automatically
|
| 569 |
+
- **Batch Size Optimizer**: Find maximum batch size that fits your GPU memory
|
| 570 |
+
- **Config Persistence**: Save configurations to browser localStorage
|
| 571 |
+
- **Comparison Mode**: Compare different configurations side-by-side
|
| 572 |
+
|
| 573 |
+
### Accessibility
|
| 574 |
+
- **ARIA Labels**: Full screen reader support throughout the interface
|
| 575 |
+
- **Keyboard Navigation**: All features accessible via keyboard
|
| 576 |
+
- **Colorblind-Friendly**: Patterns and textures supplement colors in charts
|
| 577 |
+
- **High Contrast**: Clear visual indicators with multiple cues
|
| 578 |
+
|
| 579 |
+
### API Endpoints
|
| 580 |
+
- `POST /api/calculate` - Calculate GPU memory requirements
|
| 581 |
+
- `POST /api/explain-formula` - Get detailed formula explanation
|
| 582 |
+
- `POST /api/export/deepspeed` - Export DeepSpeed config file
|
| 583 |
+
- `POST /api/optimize/batch-size` - Find maximum batch size
|
| 584 |
+
- `GET /api/preset/{preset_name}` - Load model preset
|
| 585 |
+
|
| 586 |
+
## Development
|
| 587 |
+
|
| 588 |
+
### Running Tests
|
| 589 |
+
|
| 590 |
+
```bash
|
| 591 |
+
pytest tests/
|
| 592 |
+
```
|
| 593 |
+
|
| 594 |
+
### Test Coverage
|
| 595 |
+
|
| 596 |
+
The calculator includes comprehensive testing:
|
| 597 |
+
- **Unit Tests**: Core calculation logic for each engine type
|
| 598 |
+
- **Integration Tests**: End-to-end configuration validation
|
| 599 |
+
- **Formula Verification**: 18 scenarios verifying formula accuracy
|
| 600 |
+
- **API Tests**: Web API endpoint testing
|
| 601 |
+
- **Accessibility Tests**: Screen reader and keyboard navigation
|
| 602 |
+
|
| 603 |
+
All formulas verified accurate against authoritative sources with 100% test pass rate.
|
| 604 |
+
|
| 605 |
+
### Code Formatting
|
| 606 |
+
|
| 607 |
+
```bash
|
| 608 |
+
black src/ cli/ web/
|
| 609 |
+
ruff check src/ cli/ web/
|
| 610 |
+
```
|
| 611 |
+
|
| 612 |
+
### Type Checking
|
| 613 |
+
|
| 614 |
+
```bash
|
| 615 |
+
mypy src/
|
| 616 |
+
```
|
| 617 |
+
|
| 618 |
+
## Recent Improvements
|
| 619 |
+
|
| 620 |
+
### Latest Updates
|
| 621 |
+
- ✨ Added formula explanation feature with detailed breakdowns
|
| 622 |
+
- ✨ Added client-side validation for better UX
|
| 623 |
+
- ✨ Added batch size optimizer API
|
| 624 |
+
- ✨ Added DeepSpeed config export functionality
|
| 625 |
+
- ✨ Added comprehensive input validation
|
| 626 |
+
- ✨ Added result caching for performance
|
| 627 |
+
- ♿ Added ARIA labels for full accessibility
|
| 628 |
+
- ♿ Added colorblind patterns to charts
|
| 629 |
+
- 🐛 Fixed optimizer formulas to be optimizer-specific
|
| 630 |
+
- 🐛 Fixed Pydantic namespace warnings
|
| 631 |
+
|
| 632 |
+
### Verification Status
|
| 633 |
+
- ✅ All 18 test scenarios passing (100%)
|
| 634 |
+
- ✅ Formulas verified against EleutherAI, Microsoft Research, DeepSpeed docs
|
| 635 |
+
- ✅ Optimizer formulas corrected for AdamW, AdamW 8-bit, and SGD
|
| 636 |
+
- ✅ ZeRO stage formulas validated (0, 1, 2, 3)
|
| 637 |
+
- ✅ Engine type formulas validated (PyTorch DDP, DeepSpeed, FSDP, Megatron-LM)
|
| 638 |
+
|
| 639 |
+
## Contributing
|
| 640 |
+
|
| 641 |
+
Contributions are welcome! Please feel free to submit a Pull Request. See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
|
| 642 |
+
|
| 643 |
+
## 📚 References
|
| 644 |
+
|
| 645 |
+
The memory calculations in this tool are based on authoritative sources:
|
| 646 |
+
|
| 647 |
+
**Core Memory Formulas:**
|
| 648 |
+
- [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) - Comprehensive breakdown of transformer memory requirements
|
| 649 |
+
- [Microsoft Research ZeRO Blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) - ZeRO optimization techniques
|
| 650 |
+
- [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/abs/2205.05198) - Activation checkpointing strategies
|
| 651 |
+
|
| 652 |
+
**Engine Documentation:**
|
| 653 |
+
- [DeepSpeed Memory Documentation](https://deepspeed.readthedocs.io/en/latest/memory.html) - Official DeepSpeed memory formulas
|
| 654 |
+
- [NVIDIA Megatron-LM](https://github.com/NVIDIA/Megatron-LM) - Tensor and pipeline parallelism
|
| 655 |
+
- [PyTorch FSDP Documentation](https://pytorch.org/docs/stable/fsdp.html) - Fully sharded data parallel
|
| 656 |
+
- [PyTorch DDP Tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) - Distributed data parallel
|
| 657 |
+
|
| 658 |
+
**Related Tools:**
|
| 659 |
+
- [llm-analysis](https://github.com/cli99/llm-analysis) - LLM memory analysis
|
| 660 |
+
- [vram-calculator](https://github.com/furiousteabag/vram-calculator) - VRAM calculation utilities
|
| 661 |
+
|
| 662 |
+
## 🤝 Community & Support
|
| 663 |
+
|
| 664 |
+
- 📖 [Documentation](README.md)
|
| 665 |
+
- 🐛 [Issue Tracker](https://github.com/George614/gpu-mem-calculator/issues)
|
| 666 |
+
- 💬 [Discussions](https://github.com/George614/gpu-mem-calculator/discussions)
|
| 667 |
+
- 📧 Contact the maintainers via GitHub
|
| 668 |
+
|
| 669 |
+
### Star History
|
| 670 |
+
|
| 671 |
+
If you find this tool useful, please consider giving it a star! ⭐
|
| 672 |
+
|
| 673 |
+
## 📋 Roadmap
|
| 674 |
+
|
| 675 |
+
- [x] Inference memory calculation
|
| 676 |
+
- [x] Multi-node training configurations
|
| 677 |
+
- [x] Export to training framework configs (Accelerate, Lightning, Axolotl)
|
| 678 |
+
- [ ] PyPI package distribution
|
| 679 |
+
- [ ] Support for more model architectures (Vision Transformers, Diffusion models)
|
| 680 |
+
- [ ] Real-time memory monitoring dashboard
|
| 681 |
+
- [ ] CLI commands for inference and export features
|
| 682 |
+
|
| 683 |
+
## 🙏 Acknowledgments
|
| 684 |
+
|
| 685 |
+
This tool was inspired by and builds upon the excellent work of:
|
| 686 |
+
- [DeepSpeed Memory Estimator](https://deepspeed.readthedocs.io/en/latest/memory.html) - ZeRO memory optimization formulas
|
| 687 |
+
- [llm-analysis](https://github.com/cli99/llm-analysis) - LLM memory analysis methodology
|
| 688 |
+
- [vram-calculator](https://github.com/furiousteabag/vram-calculator) - VRAM calculation approach
|
| 689 |
+
|
| 690 |
+
Special thanks to the EleutherAI community for their comprehensive [Transformer Math 101](https://blog.eleuther.ai/transformer-math/) guide, which provides detailed formulas for transformer memory calculations.
|
| 691 |
+
|
| 692 |
+
## 📄 License
|
| 693 |
+
|
| 694 |
+
MIT License - see [LICENSE](LICENSE) for details.
|
| 695 |
+
|
| 696 |
+
## 📚 Citation
|
| 697 |
+
|
| 698 |
+
If you use this tool in your research, please cite:
|
| 699 |
+
|
| 700 |
+
```bibtex
|
| 701 |
+
@software{gpu_mem_calculator,
|
| 702 |
+
title = {GPU Memory Calculator for LLM Training},
|
| 703 |
+
author = {GPU Mem Calculator Team},
|
| 704 |
+
year = {2024},
|
| 705 |
+
url = {https://github.com/George614/gpu-mem-calculator}
|
| 706 |
+
}
|
| 707 |
+
```
|
| 708 |
+
|
| 709 |
+
---
|
| 710 |
+
|
| 711 |
+
<p align="center">
|
| 712 |
+
Made with ❤️ for the ML community
|
| 713 |
+
</p>
|
| 714 |
+
|
| 715 |
+
<p align="center">
|
| 716 |
+
<a href="https://github.com/George614/gpu-mem-calculator/stargazers">⭐ Star us on GitHub</a> •
|
| 717 |
+
<a href="https://github.com/George614/gpu-mem-calculator/issues">🐛 Report a Bug</a> •
|
| 718 |
+
<a href="https://github.com/George614/gpu-mem-calculator/issues">💡 Request a Feature</a>
|
| 719 |
+
</p>
|
| 720 |
+
|
src/gpu_mem_calculator.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LICENSE
|
| 2 |
+
README.md
|
| 3 |
+
pyproject.toml
|
| 4 |
+
src/gpu_mem_calculator/__init__.py
|
| 5 |
+
src/gpu_mem_calculator/py.typed
|
| 6 |
+
src/gpu_mem_calculator.egg-info/PKG-INFO
|
| 7 |
+
src/gpu_mem_calculator.egg-info/SOURCES.txt
|
| 8 |
+
src/gpu_mem_calculator.egg-info/dependency_links.txt
|
| 9 |
+
src/gpu_mem_calculator.egg-info/entry_points.txt
|
| 10 |
+
src/gpu_mem_calculator.egg-info/requires.txt
|
| 11 |
+
src/gpu_mem_calculator.egg-info/top_level.txt
|
| 12 |
+
src/gpu_mem_calculator/cli/__init__.py
|
| 13 |
+
src/gpu_mem_calculator/cli/main.py
|
| 14 |
+
src/gpu_mem_calculator/config/__init__.py
|
| 15 |
+
src/gpu_mem_calculator/config/parser.py
|
| 16 |
+
src/gpu_mem_calculator/config/presets.py
|
| 17 |
+
src/gpu_mem_calculator/core/__init__.py
|
| 18 |
+
src/gpu_mem_calculator/core/calculator.py
|
| 19 |
+
src/gpu_mem_calculator/core/formulas.py
|
| 20 |
+
src/gpu_mem_calculator/core/models.py
|
| 21 |
+
src/gpu_mem_calculator/core/multinode.py
|
| 22 |
+
src/gpu_mem_calculator/engines/__init__.py
|
| 23 |
+
src/gpu_mem_calculator/engines/base.py
|
| 24 |
+
src/gpu_mem_calculator/engines/deepspeed.py
|
| 25 |
+
src/gpu_mem_calculator/engines/fsdp.py
|
| 26 |
+
src/gpu_mem_calculator/engines/megatron.py
|
| 27 |
+
src/gpu_mem_calculator/engines/pytorch.py
|
| 28 |
+
src/gpu_mem_calculator/exporters/__init__.py
|
| 29 |
+
src/gpu_mem_calculator/exporters/accelerate.py
|
| 30 |
+
src/gpu_mem_calculator/exporters/axolotl.py
|
| 31 |
+
src/gpu_mem_calculator/exporters/lightning.py
|
| 32 |
+
src/gpu_mem_calculator/exporters/manager.py
|
| 33 |
+
src/gpu_mem_calculator/inference/__init__.py
|
| 34 |
+
src/gpu_mem_calculator/inference/base.py
|
| 35 |
+
src/gpu_mem_calculator/inference/calculator.py
|
| 36 |
+
src/gpu_mem_calculator/inference/huggingface.py
|
| 37 |
+
src/gpu_mem_calculator/inference/tensorrt_llm.py
|
| 38 |
+
src/gpu_mem_calculator/inference/tgi.py
|
| 39 |
+
src/gpu_mem_calculator/inference/vllm.py
|
| 40 |
+
src/gpu_mem_calculator/utils/__init__.py
|
| 41 |
+
src/gpu_mem_calculator/utils/precision.py
|
| 42 |
+
tests/test_calculator.py
|
| 43 |
+
tests/test_comprehensive.py
|
| 44 |
+
tests/test_exporters.py
|
| 45 |
+
tests/test_inference.py
|
| 46 |
+
tests/test_multinode.py
|
src/gpu_mem_calculator.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/gpu_mem_calculator.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
gpu-mem-calc = gpu_mem_calculator.cli:main
|
src/gpu_mem_calculator.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydantic>=2.0.0
|
| 2 |
+
click>=8.1.0
|
| 3 |
+
pydantic-settings>=2.0.0
|
| 4 |
+
rich>=13.0.0
|
| 5 |
+
|
| 6 |
+
[dev]
|
| 7 |
+
pytest>=7.0.0
|
| 8 |
+
pytest-cov>=4.0.0
|
| 9 |
+
black>=23.0.0
|
| 10 |
+
ruff>=0.1.0
|
| 11 |
+
mypy>=1.5.0
|
| 12 |
+
|
| 13 |
+
[web]
|
| 14 |
+
fastapi>=0.100.0
|
| 15 |
+
uvicorn[standard]>=0.23.0
|
| 16 |
+
jinja2>=3.1.0
|
src/gpu_mem_calculator.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gpu_mem_calculator
|
src/gpu_mem_calculator/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GPU Memory Calculator for LLM Training."""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (257 Bytes). View file
|
|
|
src/gpu_mem_calculator/cli/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI interface for GPU Memory Calculator."""
|
| 2 |
+
|
| 3 |
+
from gpu_mem_calculator.cli.main import main
|
| 4 |
+
|
| 5 |
+
__all__ = ["main"]
|
src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (322 Bytes). View file
|
|
|
src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc
ADDED
|
Binary file (14.3 kB). View file
|
|
|
src/gpu_mem_calculator/cli/main.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI interface for GPU Memory Calculator."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import TYPE_CHECKING, Literal
|
| 7 |
+
|
| 8 |
+
import click
|
| 9 |
+
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
|
| 12 |
+
from gpu_mem_calculator.core.models import MemoryResult
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# NOTE(review): the version string "0.1.0" is hard-coded here and duplicated in
# gpu_mem_calculator/__init__.py (__version__) — consider sourcing one from the
# other so releases only bump a single place.
@click.group()
@click.version_option(version="0.1.0")
def main() -> None:
    """GPU Memory Calculator for LLM Training.

    Calculate GPU memory requirements for training Large Language Models
    with various training engines (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP).
    """
    # Root command group: subcommands (calculate, quick, validate, presets)
    # register themselves via @main.command(); the group body itself does nothing.
    pass
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@main.command()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True),
    help="Path to JSON configuration file",
)
@click.option(
    "--preset",
    "-p",
    type=str,
    help="Name of a preset model configuration",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output file path (default: stdout)",
)
@click.option(
    "--format",
    "-f",
    type=click.Choice(["json", "yaml", "table"]),
    default="table",
    help="Output format (default: table)",
)
def calculate(
    config: str | None,
    preset: str | None,
    output: str | None,
    format: Literal["json", "yaml", "table"],
) -> None:
    """Calculate GPU memory requirements from config file or preset.

    Examples:
        gpu-mem-calc calculate --config configs/llama2_7b.json
        gpu-mem-calc calculate --preset llama2-7b
        gpu-mem-calc calculate -p mixtral-8x7b --format json
    """
    # Exactly one of --config / --preset must be supplied.
    if not config and not preset:
        click.echo("Error: Either --config or --preset is required", err=True)
        sys.exit(1)

    if config and preset:
        click.echo("Error: Cannot use both --config and --preset", err=True)
        sys.exit(1)

    try:
        import tempfile

        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator

        if preset:
            # Resolve the named preset to a concrete configuration dict.
            from gpu_mem_calculator.config.presets import get_preset_config

            preset_config = get_preset_config(preset)
            if preset_config is None:
                click.echo(
                    f"Error: Preset '{preset}' not found. "
                    "Use 'gpu-mem-calc presets' to list available presets.",
                    err=True,
                )
                sys.exit(1)

            # Write the preset to a temp file so we can reuse from_config_file.
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
                json.dump(preset_config, f, indent=2)
                temp_path = f.name

            # BUGFIX: previously the temp file was only unlinked on the success
            # path, leaking it whenever from_config_file raised. Always clean up.
            try:
                calculator = GPUMemoryCalculator.from_config_file(temp_path)
            finally:
                Path(temp_path).unlink(missing_ok=True)
        elif config:
            calculator = GPUMemoryCalculator.from_config_file(config)
        else:
            # Unreachable: guarded by the mutual-exclusion checks above.
            click.echo("Error: Either --config or --preset is required", err=True)
            sys.exit(1)

        result = calculator.calculate()

        # Render the result in the requested format.
        if format == "json":
            output_text = json.dumps(result.model_dump(mode="json"), indent=2)
        elif format == "yaml":
            try:
                import yaml  # type: ignore[import-untyped]

                output_text = yaml.dump(result.model_dump(mode="json"), default_flow_style=False)
            except ImportError:
                click.echo(
                    "Error: YAML format requires PyYAML. Install with: pip install pyyaml",
                    err=True,
                )
                sys.exit(1)
        else:  # table
            output_text = _format_result_as_table(result, calculator)

        # Write to the requested file, or print to stdout.
        if output:
            Path(output).write_text(output_text)
            click.echo(f"Results written to {output}")
        else:
            click.echo(output_text)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
@main.command()
@click.argument(
    "params",
    type=float,
    required=True,
)
@click.option(
    "--gpus",
    "-g",
    type=int,
    default=1,
    help="Number of GPUs (default: 1)",
)
@click.option(
    "--gpu-mem",
    "-m",
    type=float,
    default=80.0,
    help="GPU memory in GB (default: 80.0)",
)
@click.option(
    "--engine",
    "-e",
    type=click.Choice(["pytorch", "deepspeed", "megatron", "fsdp"]),
    default="pytorch",
    help="Training engine (default: pytorch)",
)
@click.option(
    "--dtype",
    "-d",
    type=click.Choice(["fp32", "fp16", "bf16"]),
    default="bf16",
    help="Data type (default: bf16)",
)
def quick(
    params: float,
    gpus: int,
    gpu_mem: float,
    engine: str,
    dtype: str,
) -> None:
    """Quick calculation from model size (in billions of parameters).

    Example:
        gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
    """
    try:
        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
        from gpu_mem_calculator.core.models import (
            DType,
            EngineConfig,
            EngineType,
            GPUConfig,
            ModelConfig,
            ParallelismConfig,
            TrainingConfig,
        )

        # Resolve the CLI engine/dtype strings to their enum values.
        engine_types = {
            "pytorch": EngineType.PYTORCH_DDP,
            "deepspeed": EngineType.DEEPSPEED,
            "megatron": EngineType.MEGATRON_LM,
            "fsdp": EngineType.FSDP,
        }
        dtype_values = {
            "fp32": DType.FP32,
            "fp16": DType.FP16,
            "bf16": DType.BF16,
        }

        total_params = int(params * 1e9)

        # Rough (upper_bound_in_B, hidden_size, num_layers) tiers modeled on
        # typical transformer architectures; the trailing default is the
        # fallback for very large models.
        size_tiers = (
            (1, 768, 12),
            (7, 4096, 32),
            (13, 5120, 40),
            (30, 6656, 60),
            (65, 8192, 80),
        )
        hidden_size, num_layers = 12288, 96
        for upper_bound, width, depth in size_tiers:
            if params <= upper_bound:
                hidden_size, num_layers = width, depth
                break

        model_config = ModelConfig(
            name="quick-estimate",
            num_parameters=total_params,
            num_layers=num_layers,
            hidden_size=hidden_size,
            num_attention_heads=hidden_size // 128,
            vocab_size=32000,
            max_seq_len=2048,
        )

        training_config = TrainingConfig(
            batch_size=1,
            gradient_accumulation_steps=1,
            dtype=dtype_values[dtype],
        )

        calculator = GPUMemoryCalculator(
            model_config=model_config,
            training_config=training_config,
            parallelism_config=ParallelismConfig(data_parallel_size=gpus),
            engine_config=EngineConfig(
                type=engine_types[engine],
                zero_stage=2 if engine == "deepspeed" else None,
            ),
            gpu_config=GPUConfig(num_gpus=gpus, gpu_memory_gb=gpu_mem),
        )

        # Render the estimate the same way as the `calculate` subcommand.
        click.echo(_format_result_as_table(calculator.calculate(), calculator))

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
@main.command()
@click.argument(
    "config_path",
    type=click.Path(exists=True),
)
def validate(config_path: str) -> None:
    """Validate a configuration file.

    Example:
        gpu-mem-calc validate configs/my_config.json
    """
    try:
        # Imported lazily so an import failure is reported like any other
        # validation problem rather than crashing the CLI at startup.
        from gpu_mem_calculator.config import ConfigParser

        ConfigParser.parse_full_config(config_path)
    except Exception as e:
        click.echo(f"✗ Validation failed: {e}", err=True)
        sys.exit(1)
    else:
        click.echo(f"✓ Configuration file '{config_path}' is valid")
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
@main.command()
@click.option(
    "--format",
    "-f",
    type=click.Choice(["list", "json", "table"]),
    default="list",
    help="Output format (default: list)",
)
def presets(format: str) -> None:
    """List available model preset configurations.

    Examples:
        gpu-mem-calc presets
        gpu-mem-calc presets --format table
        gpu-mem-calc presets -f json
    """
    try:
        from gpu_mem_calculator.config.presets import list_presets

        available = list_presets()

        if not available:
            click.echo("No presets found.")
            return

        if format == "json":
            # Machine-readable dump of every preset.
            click.echo(json.dumps(available, indent=2))
        elif format == "table":
            from rich.console import Console
            from rich.table import Table

            table = Table(
                title="Available Model Presets",
                show_header=True,
                header_style="bold magenta",
            )
            table.add_column("Preset Name", style="cyan", width=25)
            table.add_column("Display Name", style="green", width=30)
            table.add_column("Description", style="yellow")

            for preset_name, meta in sorted(available.items()):
                table.add_row(preset_name, meta["display_name"], meta["description"])

            Console().print(table)
        else:  # list format
            click.echo("Available model presets:\n")
            for preset_name, meta in sorted(available.items()):
                click.echo(f"  {preset_name:25} - {meta['display_name']}")
                if meta.get("description"):
                    # Indent the description to line up under the display name.
                    click.echo(" " * 27 + meta["description"])
                click.echo()

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def _format_result_as_table(result: MemoryResult, calculator: "GPUMemoryCalculator") -> str:
    """Format a calculation result as an ASCII table.

    Args:
        result: Memory calculation output to render.
        calculator: Calculator that produced the result (currently unused;
            kept for interface stability with the CLI call sites).

    Returns:
        The rendered table as a plain string.
    """
    from io import StringIO

    from rich.console import Console
    from rich.table import Table

    # FIX: bind the Console to an in-memory buffer at construction instead of
    # reassigning console.file after the fact — the old pattern let rich
    # inspect the real stdout (terminal/width/color detection) before the
    # redirect took effect.
    buffer = StringIO()
    console = Console(file=buffer)

    # Main results table
    table = Table(
        title="GPU Memory Calculation Results",
        show_header=True,
        header_style="bold magenta",
    )
    table.add_column("Metric", style="cyan", width=30)
    table.add_column("Value", style="green")

    # Headline memory numbers
    table.add_row("Memory per GPU", f"{result.total_memory_per_gpu_gb:.2f} GB")
    table.add_row("Total GPU Memory", f"{result.total_memory_all_gpus_gb:.2f} GB")
    table.add_row("CPU Memory", f"{result.cpu_memory_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Per-component breakdown
    table.add_row("Model Parameters", f"{result.breakdown.model_params_gb:.2f} GB")
    table.add_row("Gradients", f"{result.breakdown.gradients_gb:.2f} GB")
    table.add_row("Optimizer States", f"{result.breakdown.optimizer_states_gb:.2f} GB")
    table.add_row("Activations", f"{result.breakdown.activations_gb:.2f} GB")
    table.add_row("Overhead", f"{result.breakdown.overhead_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Feasibility summary
    status = "✓ Fits" if result.fits_on_gpu else "✗ OOM"
    table.add_row("Status", status)
    table.add_row("Memory Utilization", f"{result.memory_utilization_percent:.1f}%")
    if result.recommended_batch_size:
        table.add_row("Recommended Batch Size", str(result.recommended_batch_size))

    console.print(table)
    return buffer.getvalue()
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
if __name__ == "__main__":
    # Allow running this module directly (e.g. `python -m ...cli.main`)
    # in addition to the installed gpu-mem-calc console script.
    main()
|
src/gpu_mem_calculator/config/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration parsing and defaults."""
|
| 2 |
+
|
| 3 |
+
from gpu_mem_calculator.config.parser import ConfigParser, load_config, save_config
|
| 4 |
+
|
| 5 |
+
__all__ = ["ConfigParser", "load_config", "save_config"]
|
src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (375 Bytes). View file
|
|
|
src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc
ADDED
|
Binary file (3.35 kB). View file
|
|
|
src/gpu_mem_calculator/config/parser.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration file parser and utilities."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, cast
|
| 6 |
+
|
| 7 |
+
from pydantic import ValidationError
|
| 8 |
+
|
| 9 |
+
from gpu_mem_calculator.core.models import (
|
| 10 |
+
DType,
|
| 11 |
+
EngineConfig,
|
| 12 |
+
EngineType,
|
| 13 |
+
GPUConfig,
|
| 14 |
+
ModelConfig,
|
| 15 |
+
OffloadDevice,
|
| 16 |
+
OptimizerType,
|
| 17 |
+
ParallelismConfig,
|
| 18 |
+
TrainingConfig,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ConfigParseError(Exception):
|
| 23 |
+
"""Error parsing configuration file."""
|
| 24 |
+
|
| 25 |
+
def __init__(self, message: str, errors: list[Any] | None = None):
|
| 26 |
+
super().__init__(message)
|
| 27 |
+
self.errors = errors or []
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ConfigParser:
|
| 31 |
+
"""Parse and validate configuration files."""
|
| 32 |
+
|
| 33 |
+
@staticmethod
|
| 34 |
+
def _convert_dtype(value: str) -> DType:
|
| 35 |
+
"""Convert string dtype to DType enum."""
|
| 36 |
+
dtype_map = {
|
| 37 |
+
"float32": DType.FP32,
|
| 38 |
+
"fp32": DType.FP32,
|
| 39 |
+
"float16": DType.FP16,
|
| 40 |
+
"fp16": DType.FP16,
|
| 41 |
+
"bfloat16": DType.BF16,
|
| 42 |
+
"bf16": DType.BF16,
|
| 43 |
+
"int8": DType.INT8,
|
| 44 |
+
"int4": DType.INT4,
|
| 45 |
+
}
|
| 46 |
+
return dtype_map.get(value.lower(), DType.BF16)
|
| 47 |
+
|
| 48 |
+
@staticmethod
|
| 49 |
+
def _convert_optimizer(value: str) -> OptimizerType:
|
| 50 |
+
"""Convert string optimizer to OptimizerType enum."""
|
| 51 |
+
opt_map = {
|
| 52 |
+
"adam": OptimizerType.ADAM,
|
| 53 |
+
"adamw": OptimizerType.ADAMW,
|
| 54 |
+
"sgd": OptimizerType.SGD,
|
| 55 |
+
"adamw_8bit": OptimizerType.ADAMW_8BIT,
|
| 56 |
+
"adamw-8bit": OptimizerType.ADAMW_8BIT,
|
| 57 |
+
}
|
| 58 |
+
return opt_map.get(value.lower(), OptimizerType.ADAMW)
|
| 59 |
+
|
| 60 |
+
@staticmethod
|
| 61 |
+
def _convert_engine(value: str) -> EngineType:
|
| 62 |
+
"""Convert string engine to EngineType enum."""
|
| 63 |
+
engine_map = {
|
| 64 |
+
"pytorch": EngineType.PYTORCH_DDP,
|
| 65 |
+
"pytorch_ddp": EngineType.PYTORCH_DDP,
|
| 66 |
+
"ddp": EngineType.PYTORCH_DDP,
|
| 67 |
+
"deepspeed": EngineType.DEEPSPEED,
|
| 68 |
+
"megatron": EngineType.MEGATRON_LM,
|
| 69 |
+
"megatron_lm": EngineType.MEGATRON_LM,
|
| 70 |
+
"megatron-lm": EngineType.MEGATRON_LM,
|
| 71 |
+
"fsdp": EngineType.FSDP,
|
| 72 |
+
"megatron_deepspeed": EngineType.MEGATRON_DEEPSPEED,
|
| 73 |
+
}
|
| 74 |
+
return engine_map.get(value.lower(), EngineType.PYTORCH_DDP)
|
| 75 |
+
|
| 76 |
+
@staticmethod
|
| 77 |
+
def _convert_offload(value: str) -> OffloadDevice:
|
| 78 |
+
"""Convert string offload to OffloadDevice enum."""
|
| 79 |
+
offload_map = {
|
| 80 |
+
"none": OffloadDevice.NONE,
|
| 81 |
+
"cpu": OffloadDevice.CPU,
|
| 82 |
+
"nvme": OffloadDevice.NVME,
|
| 83 |
+
}
|
| 84 |
+
return offload_map.get(value.lower(), OffloadDevice.NONE)
|
| 85 |
+
|
| 86 |
+
@staticmethod
|
| 87 |
+
def _parse_num_params(value: str | int | float) -> int:
|
| 88 |
+
"""Parse number of parameters from various formats.
|
| 89 |
+
|
| 90 |
+
Supports:
|
| 91 |
+
- Raw integer: 7000000000
|
| 92 |
+
- Billions: "7B", "7b", "7e9"
|
| 93 |
+
- Millions: "7000M", "7000m", "7000e6"
|
| 94 |
+
"""
|
| 95 |
+
if isinstance(value, int):
|
| 96 |
+
return value
|
| 97 |
+
if isinstance(value, float):
|
| 98 |
+
return int(value)
|
| 99 |
+
|
| 100 |
+
if isinstance(value, str):
|
| 101 |
+
value = value.strip().upper()
|
| 102 |
+
|
| 103 |
+
# Handle billions suffix
|
| 104 |
+
if value.endswith("B"):
|
| 105 |
+
return int(float(value[:-1]) * 1_000_000_000)
|
| 106 |
+
|
| 107 |
+
# Handle millions suffix
|
| 108 |
+
if value.endswith("M"):
|
| 109 |
+
return int(float(value[:-1]) * 1_000_000)
|
| 110 |
+
|
| 111 |
+
# Handle scientific notation
|
| 112 |
+
if "E" in value:
|
| 113 |
+
return int(float(value))
|
| 114 |
+
|
| 115 |
+
# Try direct conversion
|
| 116 |
+
return int(value)
|
| 117 |
+
|
| 118 |
+
raise ValueError(f"Cannot parse parameter count: {value}")
|
| 119 |
+
|
| 120 |
+
@classmethod
|
| 121 |
+
def parse_model_config(cls, data: dict[str, Any]) -> ModelConfig:
|
| 122 |
+
"""Parse model configuration from dict.
|
| 123 |
+
|
| 124 |
+
Args:
|
| 125 |
+
data: Dictionary with model configuration
|
| 126 |
+
|
| 127 |
+
Returns:
|
| 128 |
+
ModelConfig object
|
| 129 |
+
|
| 130 |
+
Raises:
|
| 131 |
+
ConfigParseError: If validation fails
|
| 132 |
+
"""
|
| 133 |
+
try:
|
| 134 |
+
# Convert parameter count if it's a string
|
| 135 |
+
if "num_parameters" in data and isinstance(data["num_parameters"], str):
|
| 136 |
+
data["num_parameters"] = cls._parse_num_params(data["num_parameters"])
|
| 137 |
+
|
| 138 |
+
if "largest_layer_params" in data and isinstance(data["largest_layer_params"], str):
|
| 139 |
+
data["largest_layer_params"] = cls._parse_num_params(data["largest_layer_params"])
|
| 140 |
+
|
| 141 |
+
return ModelConfig(**data)
|
| 142 |
+
except ValidationError as e:
|
| 143 |
+
raise ConfigParseError("Invalid model configuration", e.errors()) from e
|
| 144 |
+
|
| 145 |
+
@classmethod
|
| 146 |
+
def parse_training_config(cls, data: dict[str, Any]) -> TrainingConfig:
|
| 147 |
+
"""Parse training configuration from dict.
|
| 148 |
+
|
| 149 |
+
Args:
|
| 150 |
+
data: Dictionary with training configuration
|
| 151 |
+
|
| 152 |
+
Returns:
|
| 153 |
+
TrainingConfig object
|
| 154 |
+
|
| 155 |
+
Raises:
|
| 156 |
+
ConfigParseError: If validation fails
|
| 157 |
+
"""
|
| 158 |
+
try:
|
| 159 |
+
# Convert dtype
|
| 160 |
+
if "dtype" in data and isinstance(data["dtype"], str):
|
| 161 |
+
data["dtype"] = cls._convert_dtype(data["dtype"])
|
| 162 |
+
|
| 163 |
+
# Convert optimizer
|
| 164 |
+
if "optimizer" in data and isinstance(data["optimizer"], str):
|
| 165 |
+
data["optimizer"] = cls._convert_optimizer(data["optimizer"])
|
| 166 |
+
|
| 167 |
+
return TrainingConfig(**data)
|
| 168 |
+
except ValidationError as e:
|
| 169 |
+
raise ConfigParseError("Invalid training configuration", e.errors()) from e
|
| 170 |
+
|
| 171 |
+
@classmethod
|
| 172 |
+
def parse_parallelism_config(cls, data: dict[str, Any]) -> ParallelismConfig:
|
| 173 |
+
"""Parse parallelism configuration from dict.
|
| 174 |
+
|
| 175 |
+
Args:
|
| 176 |
+
data: Dictionary with parallelism configuration
|
| 177 |
+
|
| 178 |
+
Returns:
|
| 179 |
+
ParallelismConfig object
|
| 180 |
+
|
| 181 |
+
Raises:
|
| 182 |
+
ConfigParseError: If validation fails
|
| 183 |
+
"""
|
| 184 |
+
try:
|
| 185 |
+
return ParallelismConfig(**data)
|
| 186 |
+
except ValidationError as e:
|
| 187 |
+
raise ConfigParseError("Invalid parallelism configuration", e.errors()) from e
|
| 188 |
+
|
| 189 |
+
@classmethod
|
| 190 |
+
def parse_engine_config(cls, data: dict[str, Any]) -> EngineConfig:
|
| 191 |
+
"""Parse engine configuration from dict.
|
| 192 |
+
|
| 193 |
+
Args:
|
| 194 |
+
data: Dictionary with engine configuration
|
| 195 |
+
|
| 196 |
+
Returns:
|
| 197 |
+
EngineConfig object
|
| 198 |
+
|
| 199 |
+
Raises:
|
| 200 |
+
ConfigParseError: If validation fails
|
| 201 |
+
"""
|
| 202 |
+
try:
|
| 203 |
+
# Convert engine type
|
| 204 |
+
if "type" in data and isinstance(data["type"], str):
|
| 205 |
+
data["type"] = cls._convert_engine(data["type"])
|
| 206 |
+
|
| 207 |
+
# Convert offload options
|
| 208 |
+
if "offload_optimizer" in data and isinstance(data["offload_optimizer"], str):
|
| 209 |
+
data["offload_optimizer"] = cls._convert_offload(data["offload_optimizer"])
|
| 210 |
+
|
| 211 |
+
if "offload_param" in data and isinstance(data["offload_param"], str):
|
| 212 |
+
data["offload_param"] = cls._convert_offload(data["offload_param"])
|
| 213 |
+
|
| 214 |
+
return EngineConfig(**data)
|
| 215 |
+
except ValidationError as e:
|
| 216 |
+
raise ConfigParseError("Invalid engine configuration", e.errors()) from e
|
| 217 |
+
|
| 218 |
+
@classmethod
def parse_gpu_config(cls, data: dict[str, Any]) -> GPUConfig:
    """Validate a raw dictionary as a :class:`GPUConfig`.

    Args:
        data: Raw hardware settings.

    Returns:
        A validated GPUConfig instance.

    Raises:
        ConfigParseError: When pydantic validation rejects the data.
    """
    try:
        parsed = GPUConfig(**data)
    except ValidationError as exc:
        raise ConfigParseError("Invalid GPU configuration", exc.errors()) from exc
    return parsed
|
| 235 |
+
|
| 236 |
+
@classmethod
def parse_file(cls, config_path: str | Path) -> dict[str, Any]:
    """Read and parse a JSON configuration file.

    Args:
        config_path: Path to the configuration file.

    Returns:
        The decoded configuration dictionary.

    Raises:
        ConfigParseError: If the file is missing, unreadable, or not
            valid JSON.
    """
    path = Path(config_path)
    if not path.exists():
        raise ConfigParseError(f"Configuration file not found: {config_path}")

    try:
        with path.open("r") as handle:
            parsed = json.load(handle)
    except json.JSONDecodeError as exc:
        raise ConfigParseError(f"Invalid JSON in configuration file: {exc}") from exc
    except Exception as exc:
        raise ConfigParseError(f"Error reading configuration file: {exc}") from exc
    return cast(dict[str, Any], parsed)
|
| 261 |
+
|
| 262 |
+
@classmethod
def parse_full_config(
    cls,
    config_path: str | Path,
) -> tuple[ModelConfig, TrainingConfig, ParallelismConfig, EngineConfig, GPUConfig]:
    """Parse a complete configuration file into its five typed sections.

    Missing sections fall back to each model's defaults (an empty dict is
    passed to the section parser).

    Args:
        config_path: Path to the configuration file.

    Returns:
        Tuple of (ModelConfig, TrainingConfig, ParallelismConfig,
        EngineConfig, GPUConfig).

    Raises:
        ConfigParseError: If reading or validating any section fails.
    """
    raw = cls.parse_file(config_path)

    try:
        return (
            cls.parse_model_config(raw.get("model", {})),
            cls.parse_training_config(raw.get("training", {})),
            cls.parse_parallelism_config(raw.get("parallelism", {})),
            cls.parse_engine_config(raw.get("engine", {})),
            cls.parse_gpu_config(raw.get("hardware", {})),
        )
    except ConfigParseError:
        # Section parsers already raise a well-formed error; re-raise as-is.
        raise
    except Exception as exc:
        raise ConfigParseError(f"Unexpected error parsing configuration: {exc}") from exc
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def load_config(config_path: str | Path) -> dict[str, Any]:
    """Load a JSON configuration file.

    Thin module-level convenience wrapper around
    :meth:`ConfigParser.parse_file`.

    Args:
        config_path: Path to the configuration file.

    Returns:
        The parsed configuration dictionary.

    Raises:
        ConfigParseError: If the file is missing or not valid JSON.
    """
    return ConfigParser.parse_file(config_path)
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def save_config(data: dict[str, Any], output_path: str | Path) -> None:
|
| 313 |
+
"""Save configuration to JSON file.
|
| 314 |
+
|
| 315 |
+
Args:
|
| 316 |
+
data: Configuration dictionary to save
|
| 317 |
+
output_path: Path to save configuration file
|
| 318 |
+
"""
|
| 319 |
+
path = Path(output_path)
|
| 320 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 321 |
+
|
| 322 |
+
with path.open("w") as f:
|
| 323 |
+
json.dump(data, f, indent=2)
|
src/gpu_mem_calculator/config/presets.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Preset model configurations loader.
|
| 2 |
+
|
| 3 |
+
This module provides a centralized location for managing model preset
|
| 4 |
+
configurations that can be used by both CLI and web interfaces.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any, cast
|
| 10 |
+
|
| 11 |
+
# Project root directory: four parents up from this file
# (config/ -> gpu_mem_calculator/ -> src/ -> repo root).
# NOTE(review): this assumes a src-layout checkout; for a site-packages
# install the path points outside the package — confirm intended.
BASE_DIR = Path(__file__).parent.parent.parent.parent
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def get_presets_file_path() -> Path:
    """Locate the JSON file holding the model presets.

    Returns:
        Path to ``web/presets/models.json`` under the project root when it
        exists; otherwise the fallback path under
        ``src/gpu_mem_calculator/presets`` (which may not exist either —
        callers handle a missing file).
    """
    primary = BASE_DIR / "web" / "presets" / "models.json"
    if primary.exists():
        return primary

    # Development installs keep the presets next to the package sources.
    return BASE_DIR / "src" / "gpu_mem_calculator" / "presets" / "models.json"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def load_presets() -> dict[str, dict[str, Any]]:
    """Load every preset model configuration from disk.

    Returns:
        Mapping of preset name to its record (``display_name``,
        ``description``, ``config``). An empty dict when the presets file
        is missing, unreadable, or contains invalid JSON — loading is
        deliberately best-effort.
    """
    source = get_presets_file_path()

    if not source.exists():
        return {}

    try:
        with source.open("r") as handle:
            return cast(dict[str, dict[str, Any]], json.load(handle))
    except (json.JSONDecodeError, OSError):
        return {}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def get_preset_config(preset_name: str) -> dict[str, Any] | None:
    """Fetch the calculator configuration for a single preset.

    Args:
        preset_name: Key of the preset to look up.

    Returns:
        The preset's ``config`` section (what the calculator consumes),
        or ``None`` when no preset with that name exists.
    """
    record = load_presets().get(preset_name)

    if record is None:
        return None

    # Callers only need the raw calculator settings, not the display metadata.
    return cast(dict[str, Any], record.get("config", {}))
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def list_presets() -> dict[str, dict[str, str]]:
    """Summarize all available presets for display.

    Returns:
        Mapping of preset name to ``{"display_name": ..., "description": ...}``.
        Missing metadata falls back to the preset name / empty string.
    """
    summaries: dict[str, dict[str, str]] = {}
    for name, record in load_presets().items():
        summaries[name] = {
            "display_name": record.get("display_name", name),
            "description": record.get("description", ""),
        }
    return summaries
|
src/gpu_mem_calculator/core/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core memory calculation models and formulas."""
|
| 2 |
+
|
| 3 |
+
from gpu_mem_calculator.core.formulas import Precision
|
| 4 |
+
from gpu_mem_calculator.core.models import (
|
| 5 |
+
EngineConfig,
|
| 6 |
+
EngineType,
|
| 7 |
+
GPUConfig,
|
| 8 |
+
ModelConfig,
|
| 9 |
+
ParallelismConfig,
|
| 10 |
+
TrainingConfig,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"ModelConfig",
|
| 15 |
+
"TrainingConfig",
|
| 16 |
+
"ParallelismConfig",
|
| 17 |
+
"EngineConfig",
|
| 18 |
+
"EngineType",
|
| 19 |
+
"GPUConfig",
|
| 20 |
+
"Precision",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
# Import GPUMemoryCalculator separately to avoid circular import
|
| 24 |
+
# Use: from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
|
src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (562 Bytes). View file
|
|
|
src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc
ADDED
|
Binary file (6.51 kB). View file
|
|
|
src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc
ADDED
|
Binary file (7.29 kB). View file
|
|
|
src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc
ADDED
|
Binary file (24.4 kB). View file
|
|
|
src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc
ADDED
|
Binary file (10.8 kB). View file
|
|
|
src/gpu_mem_calculator/core/calculator.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Main GPU memory calculator.
|
| 2 |
+
|
| 3 |
+
Orchestrates the memory calculation by selecting the appropriate
|
| 4 |
+
training engine and aggregating results.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from gpu_mem_calculator.config.parser import ConfigParser
|
| 8 |
+
from gpu_mem_calculator.core.models import (
|
| 9 |
+
EngineConfig,
|
| 10 |
+
EngineType,
|
| 11 |
+
GPUConfig,
|
| 12 |
+
MemoryResult,
|
| 13 |
+
ModelConfig,
|
| 14 |
+
NodeConfig,
|
| 15 |
+
ParallelismConfig,
|
| 16 |
+
TrainingConfig,
|
| 17 |
+
)
|
| 18 |
+
from gpu_mem_calculator.engines import (
|
| 19 |
+
DeepSpeedEngine,
|
| 20 |
+
FSDPEngine,
|
| 21 |
+
MegatronDeepSpeedEngine,
|
| 22 |
+
MegatronLMEngine,
|
| 23 |
+
PyTorchDDPEngine,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Type alias for engine types
|
| 27 |
+
EngineTypeAlias = (
|
| 28 |
+
PyTorchDDPEngine | DeepSpeedEngine | MegatronLMEngine | FSDPEngine | MegatronDeepSpeedEngine
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class GPUMemoryCalculator:
    """Main GPU memory calculator.

    High-level facade: selects the training engine implementation that
    matches the configured :class:`EngineType` and delegates the memory
    math to it.
    """

    # Dispatch table mapping each engine type to its implementation class.
    # Unknown/missing types fall back to PyTorch DDP (see _get_engine),
    # matching the previous match-statement's default arm.
    _ENGINE_CLASSES: dict[EngineType, type] = {
        EngineType.PYTORCH_DDP: PyTorchDDPEngine,
        EngineType.DEEPSPEED: DeepSpeedEngine,
        EngineType.MEGATRON_LM: MegatronLMEngine,
        EngineType.FSDP: FSDPEngine,
        EngineType.MEGATRON_DEEPSPEED: MegatronDeepSpeedEngine,
    }

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig | None = None,
        engine_config: EngineConfig | None = None,
        gpu_config: GPUConfig | None = None,
        node_config: NodeConfig | None = None,
    ) -> None:
        """Initialize the calculator.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings (default: no parallelism)
            engine_config: Training engine configuration (default: PyTorch DDP)
            gpu_config: Hardware configuration (default: 1x 80GB GPU)
            node_config: Multi-node configuration (default: single node)
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config or ParallelismConfig()
        self.engine_config = engine_config or EngineConfig()
        self.gpu_config = gpu_config or GPUConfig()
        self.node_config = node_config or NodeConfig()

    def calculate(self) -> MemoryResult:
        """Calculate GPU memory requirements.

        Selects the appropriate training engine based on configuration
        and returns the memory calculation result.

        Returns:
            MemoryResult with complete memory breakdown
        """
        return self._get_engine().calculate_memory()

    def _get_engine(self) -> EngineTypeAlias:
        """Instantiate the engine matching ``self.engine_config.type``.

        All engines share the same constructor signature, so the five
        previously duplicated construction branches collapse into a single
        dict lookup. Unknown engine types default to PyTorch DDP.

        Returns:
            Engine instance configured with current settings
        """
        engine_cls = self._ENGINE_CLASSES.get(self.engine_config.type, PyTorchDDPEngine)
        return engine_cls(
            model_config=self.model_config,
            training_config=self.training_config,
            parallelism_config=self.parallelism_config,
            engine_config=self.engine_config,
            gpu_config=self.gpu_config,
            node_config=self.node_config,
        )

    @classmethod
    def from_config_file(
        cls,
        config_path: str,
    ) -> "GPUMemoryCalculator":
        """Create calculator from configuration file.

        Note: the node (multi-node) configuration is not read from the
        file; the default single-node NodeConfig is used.

        Args:
            config_path: Path to JSON configuration file

        Returns:
            Configured GPUMemoryCalculator instance
        """
        model_config, training_config, parallelism_config, engine_config, gpu_config = (
            ConfigParser.parse_full_config(config_path)
        )

        return cls(
            model_config=model_config,
            training_config=training_config,
            parallelism_config=parallelism_config,
            engine_config=engine_config,
            gpu_config=gpu_config,
        )

    def to_dict(self) -> dict:
        """Export calculator configuration to dictionary.

        Returns:
            Dictionary with all configuration sections, keyed by the same
            section names used in configuration files (plus "multinode").
        """
        return {
            "model": self.model_config.model_dump(),
            "training": self.training_config.model_dump(),
            "parallelism": self.parallelism_config.model_dump(),
            "engine": self.engine_config.model_dump(),
            "hardware": self.gpu_config.model_dump(),
            "multinode": self.node_config.model_dump(),
        }
|
src/gpu_mem_calculator/core/formulas.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Memory calculation formulas.
|
| 2 |
+
|
| 3 |
+
This module contains the fundamental formulas for calculating GPU memory
|
| 4 |
+
requirements for LLM training.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class Precision:
    """Precision information for a numeric data type.

    This is re-exported from utils.precision for convenience.
    """

    name: str  # Canonical dtype name, e.g. "fp16" or "int8"
    bits_per_param: int  # Storage width of one parameter in bits
    bytes_per_param: float  # Storage width in bytes (float — presumably to allow sub-byte dtypes like int4; confirm)
    is_integer: bool = False  # True for integer/quantized dtypes
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def calculate_parameter_memory(
    num_params: int,
    dtype: str,
    num_gpus: int = 1,
) -> float:
    """Return the memory footprint of the model weights in GB.

    Args:
        num_params: Number of model parameters.
        dtype: Storage precision (e.g. "fp32", "fp16", "bf16", "int8", "int4").
        num_gpus: Number of GPUs. Currently unused here: sharding across
            data/tensor/pipeline parallel ranks is applied by the engine
            implementations, so this returns the unpartitioned size.

    Returns:
        Memory in GB for one full copy of the parameters.
    """
    from gpu_mem_calculator.utils.precision import gb_from_params

    return gb_from_params(num_params, dtype)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def calculate_gradient_memory(
    num_params: int,
    dtype: str,
) -> float:
    """Return the gradient memory footprint in GB.

    Gradients occupy one parameter-sized buffer in the given precision;
    the FP32 master-weight update is accounted for in the optimizer
    states, not here.

    Args:
        num_params: Number of model parameters.
        dtype: Data type used to store gradients.

    Returns:
        Memory in GB.
    """
    from gpu_mem_calculator.utils.precision import gb_from_params

    return gb_from_params(num_params, dtype)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def calculate_optimizer_memory(
    num_params: int,
    optimizer: str,
) -> float:
    """Return the optimizer-state memory footprint in GB.

    Per-parameter costs (optimizer states are kept in FP32):

    * ``adam`` / ``adamw``: 12 bytes — FP32 master copy (4) + momentum (4)
      + variance (4).
    * ``adamw_8bit``: ~2 bytes — bitsandbytes quantized states.
    * ``sgd``: 4 bytes — FP32 momentum buffer (momentum assumed on).
    * anything else: treated like Adam (12 bytes).

    References:
        https://blog.eleuther.ai/transformer-math/#optimizer-states
        https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
        https://deepspeed.readthedocs.io/en/latest/memory.html

    Args:
        num_params: Number of model parameters
        optimizer: Optimizer type (adam, adamw, sgd, adamw_8bit)

    Returns:
        Memory in GB (for FP32 optimizer states)
    """
    from gpu_mem_calculator.utils.precision import gb_from_bytes

    # Bytes of optimizer state per parameter, by optimizer family.
    bytes_per_param_by_optimizer = {
        "adam": 12.0,
        "adamw": 12.0,
        "adamw_8bit": 2.0,
        "sgd": 4.0,
    }
    per_param = bytes_per_param_by_optimizer.get(optimizer.lower(), 12.0)
    return gb_from_bytes(num_params * per_param)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def calculate_activation_memory(
    batch_size: int,
    seq_len: int,
    hidden_size: int,
    num_layers: int,
    num_attention_heads: int,
    tensor_parallel_size: int = 1,
    activation_checkpointing: int = 0,
    moe_enabled: bool = False,
    num_experts: int = 1,
    top_k: int = 1,
    expert_intermediate_size: int | None = None,
) -> float:
    """Estimate activation memory in GB for a transformer forward pass.

    This is an approximation; real activation memory depends on the exact
    model implementation and framework.

    References:
        https://blog.eleuther.ai/transformer-math/#activations
        https://arxiv.org/abs/2204.13323 ("Reducing Activation Recomputation
        in Large Transformer Models")

    Per Transformer Math 101, selective activation checkpointing costs
    roughly ``s*b*h*L*(10 + 24/t)`` bytes (s=seq_len, b=batch_size,
    h=hidden_size, L=num_layers, t=tensor_parallel_size). This function
    uses the simplified heuristic of ``hidden_size * 16`` bytes per token
    per layer — a middle-ground estimate that approximates the formula
    while remaining easy to reason about.

    For MoE models, only ``top_k`` of ``num_experts`` experts are active
    per token, which scales activation memory down accordingly (plus a
    small router overhead), while wider expert FFNs scale it back up.

    Args:
        batch_size: Batch size per GPU
        seq_len: Sequence length
        hidden_size: Hidden dimension size
        num_layers: Number of transformer layers
        num_attention_heads: Number of attention heads (currently unused
            by the heuristic; kept for interface stability)
        tensor_parallel_size: Tensor parallelism degree
        activation_checkpointing: Checkpointing level (0-4)
        moe_enabled: Whether model uses Mixture of Experts
        num_experts: Total number of experts (for MoE)
        top_k: Number of active experts per token (for MoE)
        expert_intermediate_size: Expert intermediate layer size (for MoE)

    Returns:
        Memory in GB
    """
    from gpu_mem_calculator.utils.precision import gb_from_bytes

    # Heuristic: ~16 bytes of activations per token per layer per hidden
    # unit (attention outputs, MLP activations, layer norms, ...).
    per_token_layer_bytes = hidden_size * 16

    expert_scaling = 1.0
    if moe_enabled and num_experts > 1:
        # Only top_k experts fire per token; add ~10% for the router
        # (gating network) activations. Never exceed the dense cost.
        expert_scaling = min(1.0, top_k / num_experts + 0.1)

    if moe_enabled and expert_intermediate_size:
        # Experts typically use wider FFNs than dense blocks
        # (~4x hidden vs ~2x); scale up, capped at 2x.
        expert_scaling *= min(2.0, expert_intermediate_size / (hidden_size * 2))

    raw_bytes = (
        batch_size
        * seq_len
        * num_layers
        * per_token_layer_bytes
        * expert_scaling
        / tensor_parallel_size
    )

    # Fraction of activations retained at checkpointing levels 0..4:
    # 0 = no checkpointing (100%), 4 = full checkpointing (~20%).
    retained = (1.0, 0.8, 0.6, 0.4, 0.2)[min(activation_checkpointing, 4)]
    raw_bytes *= retained

    return gb_from_bytes(raw_bytes)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def calculate_overhead(
    total_memory: float,
    overhead_factor: float = 0.2,
) -> float:
    """Estimate fixed memory overhead on top of the computed total.

    Accounts for the CUDA context, allocator fragmentation, temporary
    buffers, and similar costs that scale with the workload.

    Args:
        total_memory: Total calculated memory in GB.
        overhead_factor: Fraction of ``total_memory`` to reserve
            (default 20%).

    Returns:
        Overhead memory in GB.
    """
    overhead = overhead_factor * total_memory
    return overhead
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def estimate_largest_layer_params(
|
| 243 |
+
hidden_size: int,
|
| 244 |
+
num_attention_heads: int,
|
| 245 |
+
intermediate_size: int | None = None,
|
| 246 |
+
) -> int:
|
| 247 |
+
"""Estimate the largest layer parameters for ZeRO-3 calculations.
|
| 248 |
+
|
| 249 |
+
The largest layer is typically the MLP layer or attention projection.
|
| 250 |
+
|
| 251 |
+
Args:
|
| 252 |
+
hidden_size: Hidden dimension size
|
| 253 |
+
num_attention_heads: Number of attention heads
|
| 254 |
+
intermediate_size: MLP intermediate size (default 4 * hidden_size)
|
| 255 |
+
|
| 256 |
+
Returns:
|
| 257 |
+
Estimated number of parameters in the largest layer
|
| 258 |
+
"""
|
| 259 |
+
if intermediate_size is None:
|
| 260 |
+
intermediate_size = 4 * hidden_size
|
| 261 |
+
|
| 262 |
+
# MLP layer: hidden_size * intermediate_size * 2 (for up and down projections)
|
| 263 |
+
mlp_params = hidden_size * intermediate_size * 2
|
| 264 |
+
|
| 265 |
+
# Attention output projection: hidden_size * hidden_size
|
| 266 |
+
attn_params = hidden_size * hidden_size
|
| 267 |
+
|
| 268 |
+
return max(mlp_params, attn_params)
|
src/gpu_mem_calculator/core/models.py
ADDED
|
@@ -0,0 +1,568 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data models for GPU memory calculation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from enum import Enum
|
| 6 |
+
from typing import Literal, cast
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
| 9 |
+
from pydantic_core.core_schema import ValidationInfo as FieldValidationInfo
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class EngineType(str, Enum):
    """Supported training engine types.

    Subclasses ``str`` so members compare equal to their raw string values
    and serialize cleanly through Pydantic / JSON.
    """

    PYTORCH_DDP = "pytorch_ddp"
    DEEPSPEED = "deepspeed"
    MEGATRON_LM = "megatron_lm"
    FSDP = "fsdp"
    # Combined Megatron tensor/pipeline parallelism with DeepSpeed ZeRO.
    MEGATRON_DEEPSPEED = "megatron_deepspeed"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class InferenceEngineType(str, Enum):
    """Supported inference engine types."""

    HUGGINGFACE = "huggingface"
    VLLM = "vllm"
    TGI = "tgi"
    # NOTE(review): two members for TensorRT-LLM exist ("tensorrt_llm" and
    # "trtllm") — presumably input aliases; confirm downstream code treats
    # them identically.
    TENSORRT_LLM = "tensorrt_llm"
    TRTLLM = "trtllm"
    SGLANG = "sglang"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class OptimizerType(str, Enum):
    """Supported optimizer types for training-memory estimation."""

    ADAM = "adam"
    ADAMW = "adamw"
    SGD = "sgd"
    # 8-bit AdamW (bitsandbytes-style) — smaller optimizer-state footprint.
    ADAMW_8BIT = "adamw_8bit"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class DType(str, Enum):
    """Supported data types (parameter / activation precision)."""

    FP32 = "fp32"
    FP16 = "fp16"
    BF16 = "bf16"
    INT8 = "int8"
    INT4 = "int4"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class OffloadDevice(str, Enum):
    """Offload target for optimizer states / parameters (DeepSpeed-style)."""

    NONE = "none"
    CPU = "cpu"
    NVME = "nvme"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class ModelConfig(BaseModel):
    """Model architecture configuration.

    Describes a transformer architecture (optionally MoE) in enough detail
    for the memory calculators to size parameters and activations.
    """

    name: str = Field(default="custom", description="Model name")
    num_parameters: int = Field(gt=0, description="Total number of parameters")
    num_layers: int = Field(gt=0, description="Number of transformer layers")
    hidden_size: int = Field(gt=0, description="Hidden dimension size")
    num_attention_heads: int = Field(gt=0, description="Number of attention heads")
    vocab_size: int = Field(default=32000, gt=0, description="Vocabulary size")
    max_seq_len: int = Field(default=2048, gt=0, description="Maximum sequence length")
    # Auto-filled by calculate_largest_layer() below when left unset.
    largest_layer_params: int | None = Field(
        default=None,
        gt=0,
        description="Largest layer parameters (auto-calculated if not provided)",
    )

    # MoE (Mixture of Experts) parameters
    moe_enabled: bool = Field(default=False, description="Enable Mixture of Experts")
    num_experts: int = Field(default=8, ge=1, description="Number of experts in MoE")
    top_k: int = Field(default=2, ge=1, description="Number of experts activated per token (top-k)")
    expert_intermediate_size: int | None = Field(
        default=None,
        gt=0,
        description="Expert intermediate layer size (defaults to 4x hidden_size)",
    )
    shared_expert_intermediate_size: int | None = Field(
        default=None,
        gt=0,
        description="Shared expert intermediate size (for models like GLM with shared experts)",
    )

    @model_validator(mode="after")
    def calculate_largest_layer(self) -> ModelConfig:
        """Calculate largest layer params if not provided.

        Uses rough heuristics: these are order-of-magnitude estimates of the
        biggest single layer, not exact parameter counts.
        """
        if self.largest_layer_params is not None:
            return self
        # Calculate it
        hidden = self.hidden_size
        moe_enabled = self.moe_enabled

        if hidden and moe_enabled:
            # For MoE: largest layer includes expert parameters
            # (up- and down-projection of one expert MLP; assumes 4x hidden
            # intermediate when expert_intermediate_size is unset).
            expert_intermediate = self.expert_intermediate_size or hidden * 4
            self.largest_layer_params = int(hidden * expert_intermediate * 2)
        elif hidden:
            # Dense model: attention output + MLP
            # NOTE(review): heuristic hidden^2 * 4 — confirm against the
            # calculator's expectations for non-standard MLP ratios.
            self.largest_layer_params = int(hidden * hidden * 4)
        return self

    @property
    def effective_num_experts(self) -> int:
        """Get effective number of experts (returns 1 if MoE disabled)."""
        return self.num_experts if self.moe_enabled else 1

    @property
    def active_experts(self) -> int:
        """Get number of active experts per token (top_k or 1 if dense)."""
        return self.top_k if self.moe_enabled else 1
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class TrainingConfig(BaseModel):
    """Training hyperparameters configuration."""

    batch_size: int = Field(default=1, gt=0, description="Batch size per GPU")
    gradient_accumulation_steps: int = Field(
        default=1,
        gt=0,
        description="Gradient accumulation steps",
    )
    optimizer: OptimizerType = Field(default=OptimizerType.ADAMW, description="Optimizer type")
    dtype: DType = Field(default=DType.BF16, description="Data type for training")
    # Higher levels trade recompute for lower activation memory; the exact
    # meaning of each level (0-4) is defined by the calculator, not here.
    activation_checkpointing: int = Field(
        default=0,
        ge=0,
        le=4,
        description="Activation checkpointing level (0-4)",
    )

    @property
    def effective_batch_size(self) -> int:
        """Per-GPU batch size multiplied by gradient accumulation steps."""
        return self.batch_size * self.gradient_accumulation_steps
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class ParallelismConfig(BaseModel):
    """Parallelism configuration (tensor x pipeline x data)."""

    tensor_parallel_size: int = Field(default=1, ge=1, description="Tensor parallelism degree")
    pipeline_parallel_size: int = Field(default=1, ge=1, description="Pipeline parallelism degree")
    data_parallel_size: int = Field(default=1, ge=1, description="Data parallelism degree")
    sequence_parallel: bool = Field(default=False, description="Enable sequence parallelism")

    @property
    def total_parallel_size(self) -> int:
        """Product of all parallelism degrees (total GPUs implied)."""
        return self.tensor_parallel_size * self.pipeline_parallel_size * self.data_parallel_size
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
class EngineConfig(BaseModel):
    """Training engine specific configuration.

    Some fields only apply to particular engines (noted per field); the
    model does not enforce those cross-field constraints itself.
    """

    type: EngineType = Field(default=EngineType.PYTORCH_DDP, description="Training engine type")
    # Only meaningful when type == DEEPSPEED (or MEGATRON_DEEPSPEED).
    zero_stage: int | None = Field(
        default=None,
        ge=0,
        le=3,
        description="DeepSpeed ZeRO stage (only for DeepSpeed engine)",
    )
    offload_optimizer: OffloadDevice = Field(
        default=OffloadDevice.NONE,
        description="CPU offload for optimizer states",
    )
    offload_param: OffloadDevice = Field(
        default=OffloadDevice.NONE,
        description="CPU offload for parameters",
    )
    zero_init: bool = Field(
        default=True,
        description="Use ZeRO initialization (only for DeepSpeed ZeRO-3)",
    )
    # Only meaningful when type == FSDP.
    sharding_strategy: Literal["no_shard", "shard_grad_op", "full_shard"] = Field(
        default="full_shard",
        description="FSDP sharding strategy",
    )
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
class GPUConfig(BaseModel):
    """Hardware configuration for the available GPU fleet."""

    num_gpus: int = Field(default=1, ge=1, description="Number of GPUs")
    gpu_memory_gb: float = Field(default=80.0, gt=0, description="GPU memory in GB")
    total_gpu_memory_gb: float | None = Field(
        default=None,
        description="Total GPU memory (calculated if not provided)",
    )

    @model_validator(mode="after")
    def calculate_total_memory(self) -> GPUConfig:
        """Fill in total GPU memory when not explicitly provided.

        Bug fix: the previous ``@field_validator`` implementation never ran
        for the default ``None`` (Pydantic v2 skips field validators on
        defaulted fields unless ``validate_default=True``), so
        ``total_gpu_memory_gb`` silently stayed ``None`` whenever omitted.
        A ``mode="after"`` model validator always runs and sees the final
        values of ``num_gpus`` and ``gpu_memory_gb``.
        """
        if self.total_gpu_memory_gb is None:
            self.total_gpu_memory_gb = self.num_gpus * self.gpu_memory_gb
        return self
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class InterconnectType(str, Enum):
    """Multi-node interconnect types (used to pick default bandwidth)."""

    INFINIBAND = "infiniband"
    NVLINK = "nvlink"
    ETHERNET_10G = "ethernet_10g"
    ETHERNET_25G = "ethernet_25g"
    ETHERNET_100G = "ethernet_100g"
    ETHERNET_200G = "ethernet_200g"
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
class NodeConfig(BaseModel):
    """Multi-node configuration."""

    num_nodes: int = Field(default=1, ge=1, description="Number of nodes")
    gpus_per_node: int | None = Field(
        default=None,
        ge=1,
        description="GPUs per node (calculated from num_gpus if not provided)",
    )
    interconnect_type: InterconnectType = Field(
        default=InterconnectType.INFINIBAND,
        description="Interconnect type between nodes",
    )
    interconnect_bandwidth_gbps: float | None = Field(
        default=None,
        gt=0,
        description="Interconnect bandwidth in Gbps (default: auto from type)",
    )

    @field_validator("gpus_per_node")
    @classmethod
    def calculate_gpus_per_node(cls, v: int | None, info: FieldValidationInfo) -> int | None:
        """Calculate GPUs per node if not provided.

        NOTE(review): Pydantic v2 does not run field validators for defaulted
        fields unless ``validate_default=True`` is set, so when
        ``gpus_per_node`` is omitted this validator never fires and the value
        stays ``None`` — downstream code appears to compensate with ``or 1``.
        NOTE(review): ``num_gpus`` is not a field of NodeConfig, so the lookup
        below always falls back to 1; confirm where the GPU count was meant
        to come from before relying on this calculation.
        """
        if v is None:
            num_nodes = cast(int, info.data.get("num_nodes", 1))
            num_gpus = cast(int, info.data.get("num_gpus", 1))
            return max(1, num_gpus // num_nodes)
        return v

    def get_interconnect_bandwidth_gbps(self) -> float:
        """Get interconnect bandwidth in Gbps.

        Returns bandwidth from config or default based on interconnect type.
        """
        if self.interconnect_bandwidth_gbps:
            return self.interconnect_bandwidth_gbps

        # Default bandwidth values for each interconnect type
        bandwidth_defaults = {
            InterconnectType.INFINIBAND: 200.0,  # HDR200 InfiniBand
            InterconnectType.NVLINK: 300.0,  # NVLink/NVSwitch
            InterconnectType.ETHERNET_10G: 10.0,
            InterconnectType.ETHERNET_25G: 25.0,
            InterconnectType.ETHERNET_100G: 100.0,
            InterconnectType.ETHERNET_200G: 200.0,
        }
        # 100 Gbps fallback for any future/unknown interconnect type.
        return bandwidth_defaults.get(self.interconnect_type, 100.0)

    @property
    def is_multi_node(self) -> bool:
        """Check if this is a multi-node configuration."""
        return self.num_nodes > 1
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
class NetworkOverhead(BaseModel):
    """Network communication overhead for multi-node training.

    All volumes are per-training-step estimates in GB; a default-constructed
    instance (all zeros) represents "no cross-node traffic".
    """

    allreduce_gb: float = Field(default=0.0, ge=0, description="AllReduce communication in GB")
    allgather_gb: float = Field(default=0.0, ge=0, description="AllGather communication in GB")
    reducescatter_gb: float = Field(
        default=0.0, ge=0, description="ReduceScatter communication in GB"
    )
    point_to_point_gb: float = Field(
        default=0.0, ge=0, description="Point-to-point communication in GB"
    )
    total_overhead_gb: float = Field(default=0.0, ge=0, description="Total network overhead in GB")
    estimated_overhead_ms_per_step: float | None = Field(
        default=None,
        description="Estimated communication overhead per training step in milliseconds",
    )
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
class HybridParallelismConfig(BaseModel):
    """Hybrid parallelism configuration for optimal multi-node scaling.

    Consumed by the multi-node optimizer; when ``auto_optimize`` is False
    the preferences below are ignored and the user's explicit parallelism
    settings are used as-is.
    """

    auto_optimize: bool = Field(
        default=False,
        description="Automatically optimize parallelism strategy for given hardware",
    )
    target_gpu_utilization: float = Field(
        default=0.85,
        gt=0.0,
        le=1.0,
        description="Target GPU memory utilization (0.0-1.0)",
    )
    prefer_pipeline_parallel: bool = Field(
        default=False,
        description="Prefer pipeline parallelism over data parallel for multi-node",
    )
    max_pipeline_chunks: int | None = Field(
        default=None,
        ge=1,
        description="Maximum number of pipeline chunks (virtual stages)",
    )
    enable_sequence_parallel: bool = Field(
        default=True,
        description="Enable sequence parallelism for long sequences",
    )
    sequence_parallel_threshold: int = Field(
        default=4096,
        ge=1,
        description="Sequence length threshold for enabling sequence parallel",
    )
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
class MemoryBreakdown(BaseModel):
    """Per-component breakdown of training memory; all values in GB."""

    # "model_" is a protected namespace in Pydantic v2; opt out so the
    # model_params_gb field name is allowed.
    model_config = ConfigDict(protected_namespaces=())

    model_params_gb: float = Field(ge=0, description="Model parameters memory in GB")
    gradients_gb: float = Field(ge=0, description="Gradients memory in GB")
    optimizer_states_gb: float = Field(ge=0, description="Optimizer states memory in GB")
    activations_gb: float = Field(ge=0, description="Activations memory in GB")
    overhead_gb: float = Field(default=0.0, ge=0, description="Additional overhead in GB")

    @property
    def total_memory_gb(self) -> float:
        """Sum of every memory component, in GB."""
        components = (
            self.model_params_gb,
            self.gradients_gb,
            self.optimizer_states_gb,
            self.activations_gb,
            self.overhead_gb,
        )
        return sum(components)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
class MemoryResult(BaseModel):
    """Complete memory calculation result for a training configuration."""

    total_memory_per_gpu_gb: float = Field(ge=0, description="Total memory per GPU in GB")
    total_memory_all_gpus_gb: float = Field(ge=0, description="Total memory across all GPUs in GB")
    cpu_memory_gb: float = Field(default=0.0, ge=0, description="CPU memory required in GB")
    breakdown: MemoryBreakdown = Field(description="Memory breakdown by component")
    # Only populated for multi-node runs; None for single-node.
    network_overhead: NetworkOverhead | None = Field(
        default=None,
        description="Network communication overhead for multi-node training",
    )
    fits_on_gpu: bool = Field(description="Whether the config fits on available GPU")
    memory_utilization_percent: float = Field(ge=0, description="Memory utilization percentage")
    recommended_batch_size: int | None = Field(
        default=None,
        description="Recommended batch size if current doesn't fit",
    )
    # Free-form extras; schema is whatever the calculator chooses to attach.
    multi_node_info: dict | None = Field(
        default=None,
        description="Additional multi-node configuration info",
    )
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
class KVCacheQuantization(str, Enum):
    """KV cache quantization options for inference."""

    NONE = "none"
    INT8 = "int8"
    FP8 = "fp8"
    INT4 = "int4"
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
class InferenceMemoryBreakdown(BaseModel):
    """Per-component breakdown of inference memory; all values in GB."""

    # Opt out of Pydantic's "model_" protected namespace for model_params_gb.
    model_config = ConfigDict(protected_namespaces=())

    model_params_gb: float = Field(ge=0, description="Model parameters memory in GB")
    kv_cache_gb: float = Field(ge=0, description="KV cache memory in GB")
    activations_gb: float = Field(ge=0, description="Activation memory in GB")
    overhead_gb: float = Field(default=0.0, ge=0, description="Additional overhead in GB")

    @property
    def total_memory_gb(self) -> float:
        """Sum of every memory component, in GB."""
        parts = (
            self.model_params_gb,
            self.kv_cache_gb,
            self.activations_gb,
            self.overhead_gb,
        )
        return sum(parts)
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
class InferenceConfig(BaseModel):
    """Inference-specific configuration.

    A superset of options across all supported serving engines; fields are
    grouped by engine below, and engine-specific fields are ignored by the
    other engines' calculators.
    """

    batch_size: int = Field(default=1, gt=0, description="Batch size for inference")
    max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="Override max sequence length for inference (default: use model config)",
    )
    kv_cache_quantization: KVCacheQuantization = Field(
        default=KVCacheQuantization.NONE,
        description="KV cache quantization type",
    )
    use_kv_cache: bool = Field(default=True, description="Enable KV cache for generation")
    tensor_parallel_size: int = Field(default=1, ge=1, description="Tensor parallelism degree")
    enable_streaming: bool = Field(default=False, description="Enable streaming inference")

    # Common inference options
    gpu_memory_utilization: float = Field(
        default=0.9,
        gt=0.0,
        le=1.0,
        description="GPU memory utilization target (0.0-1.0)",
    )

    # TGI-specific options
    max_total_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum total tokens (input + output) - defines memory budget",
    )
    max_input_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum input tokens",
    )
    max_batch_total_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum total tokens across all batches",
    )
    tgi_quantize: Literal[
        "none",
        "awq",
        "eetq",
        "exl2",
        "gptq",
        "marlin",
        "bitsandbytes",
        "bitsandbytes-nf4",
        "bitsandbytes-fp4",
        "fp8",
    ] = Field(
        default="none",
        description="TGI: Weight quantization method",
    )
    tgi_dtype: Literal["float16", "bfloat16"] = Field(
        default="bfloat16",
        description="TGI: Data type for inference",
    )
    sharded: bool = Field(default=False, description="TGI: Enable sharded inference")
    num_shard: int | None = Field(
        default=None,
        ge=1,
        description="TGI: Number of shards for sharded inference",
    )

    # vLLM-specific options
    block_size: int | None = Field(
        default=None,
        ge=1,
        description="vLLM: Block size for KV cache management (default: 16)",
    )
    swap_space_gb: float = Field(default=0.0, ge=0.0, description="vLLM: CPU swap space in GB")
    enable_prefix_caching: bool = Field(default=False, description="vLLM: Enable prefix caching")
    enforce_eager: bool = Field(
        default=False,
        description="vLLM: Enable eager mode (disable CUDA graph)",
    )
    max_num_batched_tokens: int | None = Field(
        default=None,
        gt=0,
        description="vLLM: Maximum number of batched tokens",
    )
    max_num_seqs: int | None = Field(
        default=None,
        gt=0,
        description="vLLM: Maximum number of sequences in a batch",
    )
    vllm_quantization: Literal["none", "awq", "gptq", "squeezellm", "fp8"] = Field(
        default="none",
        description="vLLM: Weight quantization method",
    )

    # TensorRT-LLM-specific options
    trt_max_batch_size: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum batch size",
    )
    trt_max_input_len: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum input length",
    )
    trt_max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum sequence length",
    )
    trt_max_beam_width: int | None = Field(
        default=None,
        ge=1,
        description="TensorRT-LLM: Maximum beam width for beam search",
    )

    # SGLang-specific options
    chunk_size: int | None = Field(
        default=None,
        ge=1,
        description="SGLang: Prefill chunk size for long contexts (default: 8192)",
    )
    max_running_requests: int | None = Field(
        default=None,
        ge=1,
        description="SGLang: Maximum number of concurrent requests",
    )
    disable_radix_cache: bool = Field(
        default=False,
        description="SGLang: Disable RadixAttention cache (for debugging)",
    )
    enable_p2p: bool = Field(
        default=False,
        description="SGLang: Enable P2P attention for multi-GPU",
    )
    disable_custom_all_reduce: bool = Field(
        default=False,
        description="SGLang: Disable custom all-reduce kernel",
    )
    attention_backend: Literal["flashinfer", "triton", "torch"] = Field(
        default="flashinfer",
        description="SGLang: Attention backend implementation",
    )
    enable_torch_compile: bool = Field(
        default=False,
        description="SGLang: Enable torch.compile for model optimization",
    )
    radix_cache_max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="SGLang: Maximum sequence length for RadixCache",
    )
    speculative_algo: Literal["default", "medusa", "eagle"] = Field(
        default="default",
        description="SGLang: Speculative decoding algorithm",
    )
    multi_lora_enabled: bool = Field(default=False, description="SGLang: Enable multi-LoRA serving")
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
class InferenceMemoryResult(BaseModel):
    """Inference memory calculation result."""

    total_memory_per_gpu_gb: float = Field(ge=0, description="Total memory per GPU in GB")
    total_memory_all_gpus_gb: float = Field(ge=0, description="Total memory across all GPUs in GB")
    breakdown: InferenceMemoryBreakdown = Field(description="Memory breakdown by component")
    fits_on_gpu: bool = Field(description="Whether the config fits on available GPU")
    memory_utilization_percent: float = Field(ge=0, description="Memory utilization percentage")
    max_supported_batch_size: int | None = Field(
        default=None,
        description="Maximum batch size that fits in GPU memory",
    )
    # NOTE(review): units are tokens/second per the description; how this is
    # estimated is defined by the calculator, not visible here.
    estimated_throughput_tokens_per_sec: float | None = Field(
        default=None,
        description="Estimated throughput in tokens/second",
    )
|
src/gpu_mem_calculator/core/multinode.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Multi-node training calculator.
|
| 2 |
+
|
| 3 |
+
Handles network communication overhead calculation and hybrid
|
| 4 |
+
parallelism optimization for multi-node training configurations.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from gpu_mem_calculator.core.models import (
|
| 8 |
+
EngineConfig,
|
| 9 |
+
EngineType,
|
| 10 |
+
HybridParallelismConfig,
|
| 11 |
+
ModelConfig,
|
| 12 |
+
NetworkOverhead,
|
| 13 |
+
NodeConfig,
|
| 14 |
+
ParallelismConfig,
|
| 15 |
+
TrainingConfig,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class MultiNodeCalculator:
|
| 20 |
+
"""Calculator for multi-node training overhead and optimization.
|
| 21 |
+
|
| 22 |
+
This class provides:
|
| 23 |
+
- Network communication overhead estimation
|
| 24 |
+
- Hybrid parallelism strategy optimization
|
| 25 |
+
- Multi-node performance modeling
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig,
        node_config: NodeConfig,
        engine_config: EngineConfig,
    ) -> None:
        """Initialize the multi-node calculator.

        Stores the configuration objects as-is; no validation or derived
        state is computed here.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings
            node_config: Multi-node hardware configuration
            engine_config: Training engine configuration
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config
        self.node_config = node_config
        self.engine_config = engine_config
|
| 50 |
+
|
| 51 |
+
def calculate_network_overhead(self) -> NetworkOverhead:
|
| 52 |
+
"""Calculate network communication overhead for multi-node training.
|
| 53 |
+
|
| 54 |
+
Estimates communication overhead for different collective operations
|
| 55 |
+
based on model size, parallelism strategy, and interconnect bandwidth.
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
NetworkOverhead with detailed breakdown
|
| 59 |
+
"""
|
| 60 |
+
if not self.node_config.is_multi_node:
|
| 61 |
+
return NetworkOverhead()
|
| 62 |
+
|
| 63 |
+
# Get model size in bytes
|
| 64 |
+
model_params = self.model_config.num_parameters
|
| 65 |
+
dtype_bytes = self._get_dtype_bytes()
|
| 66 |
+
model_size_bytes = int(model_params * dtype_bytes)
|
| 67 |
+
|
| 68 |
+
# Calculate communication for each collective operation
|
| 69 |
+
allreduce_gb = self._calculate_allreduce_overhead(model_size_bytes)
|
| 70 |
+
allgather_gb = self._calculate_allgather_overhead(model_size_bytes)
|
| 71 |
+
reducescatter_gb = self._calculate_reducescatter_overhead(model_size_bytes)
|
| 72 |
+
point_to_point_gb = self._calculate_pipeline_overhead(model_size_bytes)
|
| 73 |
+
|
| 74 |
+
total_overhead_gb = allreduce_gb + allgather_gb + reducescatter_gb + point_to_point_gb
|
| 75 |
+
|
| 76 |
+
# Estimate time overhead per step
|
| 77 |
+
overhead_ms = self._estimate_communication_time_ms(total_overhead_gb)
|
| 78 |
+
|
| 79 |
+
return NetworkOverhead(
|
| 80 |
+
allreduce_gb=allreduce_gb,
|
| 81 |
+
allgather_gb=allgather_gb,
|
| 82 |
+
reducescatter_gb=reducescatter_gb,
|
| 83 |
+
point_to_point_gb=point_to_point_gb,
|
| 84 |
+
total_overhead_gb=total_overhead_gb,
|
| 85 |
+
estimated_overhead_ms_per_step=overhead_ms,
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
def optimize_hybrid_parallelism(
|
| 89 |
+
self,
|
| 90 |
+
hybrid_config: HybridParallelismConfig,
|
| 91 |
+
) -> ParallelismConfig:
|
| 92 |
+
"""Optimize hybrid parallelism strategy for multi-node training.
|
| 93 |
+
|
| 94 |
+
Analyzes the hardware configuration and model characteristics
|
| 95 |
+
to recommend optimal parallelism degrees.
|
| 96 |
+
|
| 97 |
+
Args:
|
| 98 |
+
hybrid_config: Hybrid parallelism configuration and preferences
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
Optimized ParallelismConfig
|
| 102 |
+
"""
|
| 103 |
+
if not hybrid_config.auto_optimize:
|
| 104 |
+
return self.parallelism_config
|
| 105 |
+
|
| 106 |
+
num_nodes = self.node_config.num_nodes
|
| 107 |
+
gpus_per_node = self.node_config.gpus_per_node or 1
|
| 108 |
+
total_gpus = num_nodes * gpus_per_node
|
| 109 |
+
|
| 110 |
+
seq_len = self.model_config.max_seq_len
|
| 111 |
+
|
| 112 |
+
# Determine optimal parallelism strategy
|
| 113 |
+
if seq_len >= hybrid_config.sequence_parallel_threshold:
|
| 114 |
+
# Enable sequence parallel for long sequences
|
| 115 |
+
enable_sp = True
|
| 116 |
+
else:
|
| 117 |
+
enable_sp = hybrid_config.enable_sequence_parallel
|
| 118 |
+
|
| 119 |
+
# Calculate parallelism degrees
|
| 120 |
+
if hybrid_config.prefer_pipeline_parallel and num_nodes > 1:
|
| 121 |
+
# Prefer pipeline parallel across nodes
|
| 122 |
+
pp_size = int(min(num_nodes, 8)) # Limit pipeline stages
|
| 123 |
+
tp_size = int(min(gpus_per_node, 8)) # Tensor parallel within node
|
| 124 |
+
dp_size = int(total_gpus // (pp_size * tp_size))
|
| 125 |
+
else:
|
| 126 |
+
# Default: maximize data parallel
|
| 127 |
+
tp_size = 1
|
| 128 |
+
pp_size = 1
|
| 129 |
+
dp_size = int(total_gpus)
|
| 130 |
+
|
| 131 |
+
# Ensure all values are at least 1
|
| 132 |
+
tp_size = max(1, tp_size)
|
| 133 |
+
pp_size = max(1, pp_size)
|
| 134 |
+
dp_size = max(1, dp_size)
|
| 135 |
+
|
| 136 |
+
return ParallelismConfig(
|
| 137 |
+
tensor_parallel_size=tp_size,
|
| 138 |
+
pipeline_parallel_size=pp_size,
|
| 139 |
+
data_parallel_size=dp_size,
|
| 140 |
+
sequence_parallel=enable_sp,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
def _calculate_allreduce_overhead(self, model_size_bytes: int) -> float:
|
| 144 |
+
"""Calculate AllReduce communication overhead.
|
| 145 |
+
|
| 146 |
+
AllReduce is used for gradient averaging in data parallel training.
|
| 147 |
+
Algorithm: Ring AllReduce with O(2 * model_size) communication.
|
| 148 |
+
|
| 149 |
+
Args:
|
| 150 |
+
model_size_bytes: Model size in bytes
|
| 151 |
+
|
| 152 |
+
Returns:
|
| 153 |
+
Communication volume in GB
|
| 154 |
+
"""
|
| 155 |
+
# Ring AllReduce: each GPU sends/receives 2 * model_size / num_gpus
|
| 156 |
+
# But we need the total across the network
|
| 157 |
+
|
| 158 |
+
# For gradient averaging: 2 * model_size (send + receive)
|
| 159 |
+
allreduce_bytes = 2 * model_size_bytes
|
| 160 |
+
|
| 161 |
+
# Adjust for collective operation efficiency
|
| 162 |
+
# In multi-node, cross-node traffic is the bottleneck
|
| 163 |
+
if self.node_config.is_multi_node:
|
| 164 |
+
# Only cross-node traffic matters
|
| 165 |
+
allreduce_bytes = int(allreduce_bytes / self.node_config.num_nodes)
|
| 166 |
+
|
| 167 |
+
return allreduce_bytes / (1024**3)
|
| 168 |
+
|
| 169 |
+
def _calculate_allgather_overhead(self, model_size_bytes: int) -> float:
|
| 170 |
+
"""Calculate AllGather communication overhead.
|
| 171 |
+
|
| 172 |
+
AllGather is used in ZeRO-3 and tensor parallel for parameter gathering.
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
model_size_bytes: Model size in bytes
|
| 176 |
+
|
| 177 |
+
Returns:
|
| 178 |
+
Communication volume in GB
|
| 179 |
+
"""
|
| 180 |
+
# AllGather: (num_gpus - 1) * model_size / num_gpus per GPU
|
| 181 |
+
# But for ZeRO-3, we gather all parameters
|
| 182 |
+
is_zero3 = (
|
| 183 |
+
self.engine_config.type == EngineType.DEEPSPEED and self.engine_config.zero_stage == 3
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
if is_zero3:
|
| 187 |
+
# ZeRO-3 gathers all parameters during forward pass
|
| 188 |
+
allgather_bytes = model_size_bytes
|
| 189 |
+
else:
|
| 190 |
+
# Standard allgather for tensor parallel
|
| 191 |
+
allgather_bytes = int(model_size_bytes / self.parallelism_config.tensor_parallel_size)
|
| 192 |
+
|
| 193 |
+
# Adjust for multi-node
|
| 194 |
+
if self.node_config.is_multi_node:
|
| 195 |
+
allgather_bytes = int(allgather_bytes / self.node_config.num_nodes)
|
| 196 |
+
|
| 197 |
+
return allgather_bytes / (1024**3)
|
| 198 |
+
|
| 199 |
+
def _calculate_reducescatter_overhead(self, model_size_bytes: int) -> float:
|
| 200 |
+
"""Calculate ReduceScatter communication overhead.
|
| 201 |
+
|
| 202 |
+
ReduceScatter is used in ZeRO-2 and gradient sharding.
|
| 203 |
+
|
| 204 |
+
Args:
|
| 205 |
+
model_size_bytes: Model size in bytes
|
| 206 |
+
|
| 207 |
+
Returns:
|
| 208 |
+
Communication volume in GB
|
| 209 |
+
"""
|
| 210 |
+
is_zero2 = (
|
| 211 |
+
self.engine_config.type == EngineType.DEEPSPEED and self.engine_config.zero_stage == 2
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
if is_zero2:
|
| 215 |
+
# ZeRO-2 scatters gradients
|
| 216 |
+
reducescatter_bytes = model_size_bytes
|
| 217 |
+
else:
|
| 218 |
+
# Standard reducescatter
|
| 219 |
+
reducescatter_bytes = int(model_size_bytes / self.parallelism_config.data_parallel_size)
|
| 220 |
+
|
| 221 |
+
# Adjust for multi-node
|
| 222 |
+
if self.node_config.is_multi_node:
|
| 223 |
+
reducescatter_bytes = int(reducescatter_bytes / self.node_config.num_nodes)
|
| 224 |
+
|
| 225 |
+
return reducescatter_bytes / (1024**3)
|
| 226 |
+
|
| 227 |
+
def _calculate_pipeline_overhead(self, model_size_bytes: int) -> float:
|
| 228 |
+
"""Calculate pipeline parallel communication overhead.
|
| 229 |
+
|
| 230 |
+
Point-to-point communication between pipeline stages.
|
| 231 |
+
|
| 232 |
+
Args:
|
| 233 |
+
model_size_bytes: Model size in bytes
|
| 234 |
+
|
| 235 |
+
Returns:
|
| 236 |
+
Communication volume in GB
|
| 237 |
+
"""
|
| 238 |
+
if self.parallelism_config.pipeline_parallel_size <= 1:
|
| 239 |
+
return 0.0
|
| 240 |
+
|
| 241 |
+
# Pipeline parallel sends activations between stages
|
| 242 |
+
# Approximate as layer activations
|
| 243 |
+
hidden_size = self.model_config.hidden_size
|
| 244 |
+
seq_len = self.model_config.max_seq_len
|
| 245 |
+
batch_size = self.training_config.batch_size
|
| 246 |
+
num_layers = self.model_config.num_layers
|
| 247 |
+
|
| 248 |
+
# Activation size per layer
|
| 249 |
+
activation_bytes = batch_size * seq_len * hidden_size * 2 # FP16/BF16
|
| 250 |
+
|
| 251 |
+
# Number of microbatches determines communication frequency
|
| 252 |
+
# For simplicity, assume num_stages communications per step
|
| 253 |
+
pp_size = self.parallelism_config.pipeline_parallel_size
|
| 254 |
+
pipeline_comm_bytes = activation_bytes * (num_layers // pp_size)
|
| 255 |
+
|
| 256 |
+
# Adjust for multi-node
|
| 257 |
+
if self.node_config.is_multi_node:
|
| 258 |
+
pipeline_comm_bytes = int(pipeline_comm_bytes / self.node_config.num_nodes)
|
| 259 |
+
|
| 260 |
+
return pipeline_comm_bytes / (1024**3)
|
| 261 |
+
|
| 262 |
+
def _estimate_communication_time_ms(self, total_gb: float) -> float:
|
| 263 |
+
"""Estimate communication time per training step in milliseconds.
|
| 264 |
+
|
| 265 |
+
Args:
|
| 266 |
+
total_gb: Total communication volume in GB
|
| 267 |
+
|
| 268 |
+
Returns:
|
| 269 |
+
Estimated time in milliseconds
|
| 270 |
+
"""
|
| 271 |
+
if total_gb == 0:
|
| 272 |
+
return 0.0
|
| 273 |
+
|
| 274 |
+
# Get bandwidth in GB/s
|
| 275 |
+
bandwidth_gbps = self.node_config.get_interconnect_bandwidth_gbps()
|
| 276 |
+
bandwidth_gbps_per_sec = bandwidth_gbps / 8 # Convert to GB/s
|
| 277 |
+
|
| 278 |
+
# Basic time = size / bandwidth
|
| 279 |
+
time_seconds = total_gb / bandwidth_gbps_per_sec
|
| 280 |
+
|
| 281 |
+
# Add latency overhead for collective operations
|
| 282 |
+
# Typical latency: 10-50 microseconds per hop
|
| 283 |
+
num_nodes = self.node_config.num_nodes
|
| 284 |
+
latency_overhead = num_nodes * 0.00005 # 50 microseconds per node
|
| 285 |
+
|
| 286 |
+
# Network efficiency factor (not 100% efficient)
|
| 287 |
+
efficiency = 0.85
|
| 288 |
+
|
| 289 |
+
total_time_seconds = (time_seconds / efficiency) + latency_overhead
|
| 290 |
+
|
| 291 |
+
return total_time_seconds * 1000 # Convert to ms
|
| 292 |
+
|
| 293 |
+
def _get_dtype_bytes(self) -> float:
|
| 294 |
+
"""Get bytes per element based on dtype."""
|
| 295 |
+
dtype_map = {
|
| 296 |
+
"fp32": 4,
|
| 297 |
+
"fp16": 2,
|
| 298 |
+
"bf16": 2,
|
| 299 |
+
"int8": 1,
|
| 300 |
+
"int4": 0.5,
|
| 301 |
+
}
|
| 302 |
+
return dtype_map.get(self.training_config.dtype.value, 2)
|
| 303 |
+
|
| 304 |
+
def _calculate_model_size_gb(self) -> float:
|
| 305 |
+
"""Calculate model size in GB."""
|
| 306 |
+
dtype_bytes = self._get_dtype_bytes()
|
| 307 |
+
model_size_bytes = self.model_config.num_parameters * dtype_bytes
|
| 308 |
+
return model_size_bytes / (1024**3)
|
src/gpu_mem_calculator/engines/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Training engine implementations."""
|
| 2 |
+
|
| 3 |
+
from gpu_mem_calculator.engines.base import BaseEngine
|
| 4 |
+
from gpu_mem_calculator.engines.deepspeed import DeepSpeedEngine
|
| 5 |
+
from gpu_mem_calculator.engines.fsdp import FSDPEngine
|
| 6 |
+
from gpu_mem_calculator.engines.megatron import MegatronDeepSpeedEngine, MegatronLMEngine
|
| 7 |
+
from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"BaseEngine",
|
| 11 |
+
"PyTorchDDPEngine",
|
| 12 |
+
"DeepSpeedEngine",
|
| 13 |
+
"MegatronLMEngine",
|
| 14 |
+
"MegatronDeepSpeedEngine",
|
| 15 |
+
"FSDPEngine",
|
| 16 |
+
]
|
src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (688 Bytes). View file
|
|
|
src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc
ADDED
|
Binary file (8.07 kB). View file
|
|
|
src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc
ADDED
|
Binary file (11.2 kB). View file
|
|
|
src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc
ADDED
|
Binary file (8.07 kB). View file
|
|
|
src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc
ADDED
|
Binary file (8.5 kB). View file
|
|
|
src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc
ADDED
|
Binary file (3.73 kB). View file
|
|
|
src/gpu_mem_calculator/engines/base.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base class for training engine implementations."""
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
|
| 5 |
+
from gpu_mem_calculator.core.models import (
|
| 6 |
+
EngineConfig,
|
| 7 |
+
GPUConfig,
|
| 8 |
+
MemoryBreakdown,
|
| 9 |
+
MemoryResult,
|
| 10 |
+
ModelConfig,
|
| 11 |
+
NodeConfig,
|
| 12 |
+
ParallelismConfig,
|
| 13 |
+
TrainingConfig,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class BaseEngine(ABC):
    """Abstract base class for training engine memory calculation.

    Each training engine (PyTorch DDP, DeepSpeed, Megatron-LM, etc.)
    should implement this interface to provide engine-specific
    memory calculations.

    Subclasses implement :meth:`calculate_memory`; the shared helpers here
    (`_check_feasibility`, `_create_result`) turn a per-GPU breakdown into a
    complete ``MemoryResult`` including multi-node network overhead.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig,
        engine_config: EngineConfig,
        gpu_config: GPUConfig,
        node_config: NodeConfig | None = None,
    ) -> None:
        """Initialize the engine with configuration.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings
            engine_config: Engine-specific configuration
            gpu_config: Hardware configuration
            node_config: Multi-node configuration (optional; defaults to a
                fresh single-node ``NodeConfig()`` when omitted)
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config
        self.engine_config = engine_config
        self.gpu_config = gpu_config
        # Falsy node_config values are also replaced by the default.
        self.node_config = node_config or NodeConfig()

    @abstractmethod
    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for this engine.

        This is the main method that should be implemented by each engine.

        Returns:
            MemoryResult with complete memory breakdown
        """
        pass

    def _check_feasibility(
        self,
        total_memory_per_gpu: float,
    ) -> tuple[bool, float, int | None]:
        """Check if the configuration fits on available GPU.

        Args:
            total_memory_per_gpu: Total memory required per GPU

        Returns:
            Tuple of (fits_on_gpu, utilization_percent, recommended_batch_size);
            recommended_batch_size is None when the configuration fits.
        """
        available_memory = self.gpu_config.gpu_memory_gb
        utilization_percent = (total_memory_per_gpu / available_memory) * 100

        fits_on_gpu = total_memory_per_gpu <= available_memory

        # If doesn't fit, suggest a smaller batch size
        recommended_batch_size = None
        if not fits_on_gpu:
            # Simple heuristic: scale batch size inversely with memory excess.
            # This assumes memory scales roughly linearly with batch size,
            # which ignores the batch-independent parameter/optimizer terms.
            excess_factor = total_memory_per_gpu / available_memory
            recommended_batch_size = max(1, int(self.training_config.batch_size / excess_factor))

        return fits_on_gpu, utilization_percent, recommended_batch_size

    def _create_result(
        self,
        breakdown: MemoryBreakdown,
        cpu_memory_gb: float = 0.0,
    ) -> MemoryResult:
        """Create a MemoryResult from breakdown.

        Args:
            breakdown: Memory breakdown by component
            cpu_memory_gb: CPU memory required (default 0)

        Returns:
            Complete MemoryResult, including network overhead and node info
            when the configuration spans multiple nodes
        """
        total_memory_per_gpu = breakdown.total_memory_gb
        total_memory_all_gpus = total_memory_per_gpu * self.gpu_config.num_gpus

        fits_on_gpu, utilization_percent, recommended_batch_size = self._check_feasibility(
            total_memory_per_gpu
        )

        # Calculate network overhead for multi-node configurations
        network_overhead = None
        multi_node_info = None
        if self.node_config.is_multi_node:
            # Local import — presumably to avoid a circular import between
            # engines and core.multinode; confirm before hoisting to module top.
            from gpu_mem_calculator.core.multinode import MultiNodeCalculator

            multinode_calc = MultiNodeCalculator(
                model_config=self.model_config,
                training_config=self.training_config,
                parallelism_config=self.parallelism_config,
                node_config=self.node_config,
                engine_config=self.engine_config,
            )
            network_overhead = multinode_calc.calculate_network_overhead()

            # Add multi-node info
            multi_node_info = {
                "num_nodes": self.node_config.num_nodes,
                "gpus_per_node": self.node_config.gpus_per_node,
                "interconnect_type": self.node_config.interconnect_type.value,
                "interconnect_bandwidth_gbps": self.node_config.get_interconnect_bandwidth_gbps(),
            }

        return MemoryResult(
            total_memory_per_gpu_gb=total_memory_per_gpu,
            total_memory_all_gpus_gb=total_memory_all_gpus,
            cpu_memory_gb=cpu_memory_gb,
            breakdown=breakdown,
            network_overhead=network_overhead,
            fits_on_gpu=fits_on_gpu,
            memory_utilization_percent=utilization_percent,
            recommended_batch_size=recommended_batch_size,
            multi_node_info=multi_node_info,
        )

    @property
    def effective_batch_size(self) -> int:
        """Calculate effective batch size with gradient accumulation.

        effective = micro_batch * grad_accum_steps * data_parallel_size.
        """
        return (
            self.training_config.batch_size
            * self.training_config.gradient_accumulation_steps
            * self.parallelism_config.data_parallel_size
        )

    @property
    def total_num_gpus(self) -> int:
        """Get total number of GPUs."""
        return self.gpu_config.num_gpus

    @property
    def num_gpus_per_model(self) -> int:
        """Get number of GPUs per model replica.

        This is tensor_parallel * pipeline_parallel for distributed training.
        """
        return (
            self.parallelism_config.tensor_parallel_size
            * self.parallelism_config.pipeline_parallel_size
        )

    def calculate_moe_activation_multiplier(self) -> float:
        """Calculate activation memory multiplier for MoE models.

        For MoE models, activation memory depends on top_k (active experts per token)
        rather than total number of experts. This is because only top_k experts
        are activated per token during forward/backward pass.

        Returns:
            Multiplier for activation memory (1.0 for dense models, <1 for MoE)
        """
        if not self.model_config.moe_enabled:
            return 1.0

        # For MoE: only top_k experts are active per token
        # Activation memory scales with active_experts / total_experts
        # But we also have router overhead and gating network activations

        # NOTE(review): no None-guard here — if moe_enabled is True while
        # num_experts/top_k are unset this raises; confirm the config
        # validator guarantees both are populated.
        num_experts = self.model_config.num_experts
        top_k = self.model_config.top_k

        # Base activation ratio: only top_k experts active
        activation_ratio = top_k / num_experts

        # Add router overhead (typically 5-15% extra for gating)
        router_overhead = 0.1

        # For models with shared experts (like GLM), adjust accordingly
        if self.model_config.shared_expert_intermediate_size:
            # Shared expert is always active, so add its contribution
            # This is a simplified approximation
            activation_ratio = activation_ratio + (1.0 / num_experts)

        # Cap at 1.0: MoE activations never exceed the dense equivalent here.
        return min(1.0, activation_ratio + router_overhead)

    def calculate_moe_parameter_ratio(self) -> float:
        """Calculate effective parameter ratio for MoE models.

        For MoE models, only top_k experts are used during forward pass,
        but all expert parameters are stored in memory.

        Returns:
            Ratio of active parameters to total parameters (for memory estimation)
        """
        if not self.model_config.moe_enabled:
            return 1.0

        # All expert parameters are stored, but only top_k are used per token
        # For gradient calculation, we need gradients for all experts
        # So parameter storage = 1.0 (all params stored)
        # But we can use this for inference-specific calculations

        return 1.0  # All parameters stored in memory
|
src/gpu_mem_calculator/engines/deepspeed.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DeepSpeed ZeRO engine implementation.
|
| 2 |
+
|
| 3 |
+
Implements memory calculations for DeepSpeed ZeRO stages 1, 2, and 3.
|
| 4 |
+
Based on: https://deepspeed.readthedocs.io/en/latest/memory.html
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from gpu_mem_calculator.core.formulas import (
|
| 8 |
+
calculate_activation_memory,
|
| 9 |
+
calculate_overhead,
|
| 10 |
+
estimate_largest_layer_params,
|
| 11 |
+
)
|
| 12 |
+
from gpu_mem_calculator.core.models import (
|
| 13 |
+
MemoryBreakdown,
|
| 14 |
+
MemoryResult,
|
| 15 |
+
OffloadDevice,
|
| 16 |
+
)
|
| 17 |
+
from gpu_mem_calculator.engines.base import BaseEngine
|
| 18 |
+
from gpu_mem_calculator.utils.precision import gb_from_bytes
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class DeepSpeedEngine(BaseEngine):
    """DeepSpeed ZeRO memory calculation.

    Implements ZeRO stages:
    - ZeRO-1: Shard optimizer states
    - ZeRO-2: Shard optimizer states + gradients
    - ZeRO-3: Shard optimizer states + gradients + parameters

    Stage selection happens in :meth:`calculate_memory`; each stage has a
    dedicated ``_calculate_zeroN`` helper implementing the formulas from the
    DeepSpeed memory documentation.
    """

    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for DeepSpeed ZeRO training.

        Dispatches on ``engine_config.zero_stage`` (missing/None -> 0;
        unknown values fall back to ZeRO-2).

        Returns:
            MemoryResult with complete memory breakdown
        """
        zero_stage = self.engine_config.zero_stage or 0
        offload_optimizer = self.engine_config.offload_optimizer
        offload_param = self.engine_config.offload_param

        # Get largest layer params for ZeRO-3 (computed unconditionally but
        # only consumed by the stage-3 branch below)
        if self.model_config.largest_layer_params is None:
            largest_layer_params = estimate_largest_layer_params(
                hidden_size=self.model_config.hidden_size,
                num_attention_heads=self.model_config.num_attention_heads,
            )
        else:
            largest_layer_params = self.model_config.largest_layer_params

        match zero_stage:
            case 0:
                return self._calculate_zero0()
            case 1:
                return self._calculate_zero1(offload_optimizer)
            case 2:
                return self._calculate_zero2(offload_optimizer)
            case 3:
                return self._calculate_zero3(
                    offload_optimizer,
                    offload_param,
                    largest_layer_params,
                )
            case _:
                # Default to ZeRO-2
                return self._calculate_zero2(offload_optimizer)

    def _calculate_zero0(self) -> MemoryResult:
        """Calculate memory for ZeRO-0 (disabled, same as PyTorch DDP)."""
        # Import here to avoid circular dependency
        from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine

        # ZeRO-0 is the same as PyTorch DDP
        # NOTE(review): node_config is not forwarded here, so the DDP engine
        # falls back to its single-node default and multi-node overhead is
        # dropped for ZeRO-0 — confirm this is intended.
        ddp_engine = PyTorchDDPEngine(
            model_config=self.model_config,
            training_config=self.training_config,
            parallelism_config=self.parallelism_config,
            engine_config=self.engine_config,
            gpu_config=self.gpu_config,
        )
        return ddp_engine.calculate_memory()

    def _calculate_zero1(
        self,
        offload_optimizer: OffloadDevice,
    ) -> MemoryResult:
        """Calculate memory for ZeRO-1 (shard optimizer states).

        ZeRO-1 shards optimizer states across data parallel GPUs.

        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

        Memory formula:
        - offload_optimizer=cpu: 2 * params (fp16 params only on GPU)
        - offload_optimizer=none: 4 * params (fp16 params + fp32 params) +
          12 * params / num_gpus (sharded optimizer states)

        Note: Optimizer states = 12 bytes per param for Adam/AdamW
        - 4 bytes: FP32 parameter copy
        - 4 bytes: Momentum (FP32)
        - 4 bytes: Variance (FP32)

        NOTE(review): the code below always keeps full fp16 gradients
        (2 * params) on GPU, even with CPU offload — that diverges from the
        "2 * params" offload formula in this docstring; confirm which is
        intended.
        """
        num_params = self.model_config.num_parameters
        num_gpus = self.total_num_gpus

        # Model parameters (fp16/bf16 on GPU)
        model_params_gb = gb_from_bytes(num_params * 2)  # FP16/BF16 = 2 bytes

        # Gradients (fp16 on GPU) — unsharded in ZeRO-1
        gradients_gb = gb_from_bytes(num_params * 2)

        # Optimizer states (sharded across GPUs, possibly offloaded to CPU)
        # 12 bytes per param for Adam/AdamW (FP32 params copy + momentum + variance)
        if offload_optimizer == OffloadDevice.CPU:
            # Offloaded to CPU, minimal GPU memory for optimizer
            optimizer_gb = 0.0
            cpu_memory_gb = gb_from_bytes(num_params * 12)  # Full optimizer on CPU
        else:
            # Sharded across GPUs: 12 bytes / num_gpus per GPU
            optimizer_gb = gb_from_bytes((num_params * 12) / num_gpus)
            cpu_memory_gb = 0.0

        # Activations (same as baseline)
        activations_gb = calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

        # Overhead (framework/fragmentation allowance on top of the base sum)
        base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
        overhead_gb = calculate_overhead(base_memory)

        breakdown = MemoryBreakdown(
            model_params_gb=model_params_gb,
            gradients_gb=gradients_gb,
            optimizer_states_gb=optimizer_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown, cpu_memory_gb)

    def _calculate_zero2(
        self,
        offload_optimizer: OffloadDevice,
    ) -> MemoryResult:
        """Calculate memory for ZeRO-2 (shard optimizer + gradients).

        ZeRO-2 shards optimizer states AND gradients across data parallel GPUs.

        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

        Memory formula:
        - offload_optimizer=cpu: 2 * params (fp16 params) +
          (2 * params / num_gpus) (sharded fp16 grads)
        - offload_optimizer=none: 2 * params (fp16 params) +
          2 * params / num_gpus (sharded fp16 grads) +
          12 * params / num_gpus (sharded optimizer states)

        Note: Unlike ZeRO-1, ZeRO-2 shards gradients across GPUs
        """
        num_params = self.model_config.num_parameters
        num_gpus = self.total_num_gpus

        # Model parameters (fp16/bf16 on GPU) - NOT sharded in ZeRO-2
        model_params_gb = gb_from_bytes(num_params * 2)  # FP16/BF16 = 2 bytes

        # Gradients (fp16 on GPU) - SHARDED in ZeRO-2
        gradients_gb = gb_from_bytes((num_params * 2) / num_gpus)

        # Optimizer states (sharded across GPUs, possibly offloaded to CPU)
        # 12 bytes per param for Adam/AdamW (FP32 params copy + momentum + variance)
        if offload_optimizer == OffloadDevice.CPU:
            # Offloaded to CPU, minimal GPU memory for optimizer
            optimizer_gb = 0.0
            cpu_memory_gb = gb_from_bytes(num_params * 12)  # Full optimizer on CPU
        else:
            # Sharded across GPUs: 12 bytes / num_gpus per GPU
            optimizer_gb = gb_from_bytes((num_params * 12) / num_gpus)
            cpu_memory_gb = 0.0

        # Activations (same as baseline)
        activations_gb = calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

        # Overhead (framework/fragmentation allowance on top of the base sum)
        base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
        overhead_gb = calculate_overhead(base_memory)

        breakdown = MemoryBreakdown(
            model_params_gb=model_params_gb,
            gradients_gb=gradients_gb,
            optimizer_states_gb=optimizer_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown, cpu_memory_gb)

    def _calculate_zero3(
        self,
        offload_optimizer: OffloadDevice,
        offload_param: OffloadDevice,
        largest_layer_params: int,
    ) -> MemoryResult:
        """Calculate memory for ZeRO-3 (shard params + optimizer + gradients).

        ZeRO-3 shards everything across GPUs.

        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

        Memory formula:
        - largest_layer_memory = 4 * largest_layer_params (fp16 params + fp16 grads)

        Case 1 (no offload):
            largest_layer_memory + 18 * params / num_gpus
            (where 18 = 16 bytes optimizer states + 2 bytes fp16 params)

        Case 2 (param + optimizer offload to CPU):
            largest_layer_memory (main limit is CPU RAM)

        Case 3 (optimizer offload to CPU only):
            largest_layer_memory + 2 * params / num_gpus

        Note: Optimizer states = 16 bytes per param for Adam/AdamW (FP32)
        - 4 bytes: FP32 parameter copy
        - 4 bytes: Momentum (FP32)
        - 4 bytes: Variance (FP32)
        - 4 bytes: Gradient (FP32 copy for optimizer update)

        Args:
            offload_optimizer: Where optimizer states live (CPU or GPU)
            offload_param: Where sharded parameters live (CPU or GPU)
            largest_layer_params: Parameter count of the single largest layer,
                which must be fully materialized on one GPU during compute
        """
        num_params = self.model_config.num_parameters
        num_gpus = self.total_num_gpus

        # Largest layer memory (fp16 params + fp16 grads gathered on one GPU)
        largest_layer_memory_gb = gb_from_bytes(largest_layer_params * 4)

        # Calculate memory based on offload configuration
        if offload_param == OffloadDevice.CPU and offload_optimizer == OffloadDevice.CPU:
            # Case 2: Both params and optimizer offloaded to CPU
            # Only need largest layer on GPU at a time
            params_per_gpu_gb = 0.0
            gradients_per_gpu_gb = 0.0
            optimizer_gb = 0.0
            cpu_memory_gb = gb_from_bytes(num_params * 18)  # Full model on CPU
        elif offload_optimizer == OffloadDevice.CPU:
            # Case 3: Only optimizer offloaded to CPU
            params_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
            gradients_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
            optimizer_gb = 0.0
            cpu_memory_gb = gb_from_bytes(num_params * 16)  # Optimizer on CPU
        else:
            # Case 1: No offload
            params_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
            gradients_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
            optimizer_gb = gb_from_bytes((num_params * 16) / num_gpus)  # FP32
            cpu_memory_gb = 0.0

        # Model params = largest layer for ZeRO-3
        model_params_gb = largest_layer_memory_gb

        # Activations
        activations_gb = calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

        # Overhead (framework/fragmentation allowance on top of the base sum)
        base_memory = (
            model_params_gb
            + params_per_gpu_gb
            + gradients_per_gpu_gb
            + optimizer_gb
            + activations_gb
        )
        overhead_gb = calculate_overhead(base_memory)

        # For ZeRO-3, we combine params/gradients/optimizer into model_params in breakdown
        breakdown = MemoryBreakdown(
            model_params_gb=model_params_gb + params_per_gpu_gb,
            gradients_gb=gradients_per_gpu_gb,
            optimizer_states_gb=optimizer_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown, cpu_memory_gb)
|
src/gpu_mem_calculator/engines/fsdp.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FSDP (Fully Sharded Data Parallel) engine implementation.
|
| 2 |
+
|
| 3 |
+
Implements memory calculations for PyTorch FSDP.
|
| 4 |
+
|
| 5 |
+
Reference: https://pytorch.org/docs/stable/fsdp.html
|
| 6 |
+
Reference: https://blog.eleuther.ai/transformer-math/
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from gpu_mem_calculator.core.formulas import (
|
| 10 |
+
calculate_activation_memory,
|
| 11 |
+
calculate_overhead,
|
| 12 |
+
estimate_largest_layer_params,
|
| 13 |
+
)
|
| 14 |
+
from gpu_mem_calculator.core.models import (
|
| 15 |
+
MemoryBreakdown,
|
| 16 |
+
MemoryResult,
|
| 17 |
+
)
|
| 18 |
+
from gpu_mem_calculator.engines.base import BaseEngine
|
| 19 |
+
from gpu_mem_calculator.utils.precision import gb_from_bytes
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class FSDPEngine(BaseEngine):
    """PyTorch FSDP memory calculation.

    FSDP shards model parameters, gradients, and optimizer states
    across data parallel GPUs, similar to DeepSpeed ZeRO-3.

    Sharding strategies:
    - NO_SHARD: Equivalent to DDP (no sharding)
    - SHARD_GRAD_OP: Shard gradients and optimizer states (like ZeRO-2)
    - FULL_SHARD: Shard everything (like ZeRO-3)
    """

    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for FSDP training.

        Returns:
            MemoryResult with complete memory breakdown
        """
        # Resolve the largest layer's parameter count.  FULL_SHARD needs it
        # because each GPU must materialize one full layer during compute.
        configured = self.model_config.largest_layer_params
        if configured is not None:
            largest_layer_params = configured
        else:
            largest_layer_params = estimate_largest_layer_params(
                hidden_size=self.model_config.hidden_size,
                num_attention_heads=self.model_config.num_attention_heads,
            )

        strategy = self.engine_config.sharding_strategy
        if strategy == "no_shard":
            return self._calculate_no_shard()
        if strategy == "shard_grad_op":
            return self._calculate_shard_grad_op()
        # "full_shard" — and any unrecognized strategy — falls back to
        # full sharding, the most common FSDP configuration.
        return self._calculate_full_shard(largest_layer_params)

    def _activation_memory_gb(self) -> float:
        """Per-GPU activation memory (GB), identical for every sharding strategy."""
        return calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

    def _calculate_no_shard(self) -> MemoryResult:
        """Calculate memory for NO_SHARD (same as DDP).

        No sharding - each GPU holds a full copy of the model, so the
        calculation is delegated to the plain DDP engine.
        """
        # Local import avoids a circular dependency at module load time.
        from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine

        delegate = PyTorchDDPEngine(
            model_config=self.model_config,
            training_config=self.training_config,
            parallelism_config=self.parallelism_config,
            engine_config=self.engine_config,
            gpu_config=self.gpu_config,
        )
        return delegate.calculate_memory()

    def _calculate_shard_grad_op(self) -> MemoryResult:
        """Calculate memory for SHARD_GRAD_OP.

        Shards gradients and optimizer states across GPUs.
        Similar to DeepSpeed ZeRO-2.

        Reference: https://pytorch.org/tutorials/intermediate/FSDP_advanced.html
        Reference: https://blog.eleuther.ai/transformer-math/

        Memory formula:
        - Model parameters: Full model on each GPU (not sharded)
        - Gradients: Sharded across GPUs
        - Optimizer states: Sharded across GPUs (12 bytes per param for Adam/AdamW)

        Note: Optimizer states = 12 bytes per param for Adam/AdamW
        - 4 bytes: FP32 parameter copy
        - 4 bytes: Momentum (FP32)
        - 4 bytes: Variance (FP32)
        """
        n_params = self.model_config.num_parameters
        world_size = self.total_num_gpus

        # Full FP16/BF16 weight copy on every GPU (params are not sharded).
        weights_gb = gb_from_bytes(n_params * 2)
        # FP16/BF16 gradients, sharded across the group.
        grads_gb = gb_from_bytes((n_params * 2) / world_size)
        # FP32 Adam/AdamW states (12 bytes/param), sharded across the group.
        optim_gb = gb_from_bytes((n_params * 12) / world_size)
        acts_gb = self._activation_memory_gb()

        subtotal = weights_gb + grads_gb + optim_gb + acts_gb
        breakdown = MemoryBreakdown(
            model_params_gb=weights_gb,
            gradients_gb=grads_gb,
            optimizer_states_gb=optim_gb,
            activations_gb=acts_gb,
            overhead_gb=calculate_overhead(subtotal),
        )
        return self._create_result(breakdown)

    def _calculate_full_shard(self, largest_layer_params: int) -> MemoryResult:
        """Calculate memory for FULL_SHARD.

        Shards parameters, gradients, and optimizer states.
        Similar to DeepSpeed ZeRO-3.

        Reference: https://pytorch.org/tutorials/intermediate/FSDP_advanced.html
        Reference: https://blog.eleuther.ai/transformer-math/

        Memory formula:
        - Largest layer: 4 * largest_layer_params (fp16 params + fp16 grads)
        - Remaining parameters and gradients: Sharded across GPUs (2 bytes fp16 each)
        - Optimizer states: Sharded across GPUs (12 bytes per param for Adam/AdamW in FP32)

        Total per GPU: largest_layer_memory + 2 * params / num_gpus +
                       2 * params / num_gpus + 12 * params / num_gpus
                     = largest_layer_memory + 16 * params / num_gpus

        Note: FSDP typically uses 12 bytes for optimizer states (not 16 like DeepSpeed ZeRO-3)
        because FSDP doesn't keep an additional FP32 gradient copy in the optimizer states.
        """
        n_params = self.model_config.num_parameters
        world_size = self.total_num_gpus

        # One fully-gathered layer (fp16 params + fp16 grads) lives on each
        # GPU while it is being computed: 4 bytes per parameter of that layer.
        gathered_layer_gb = gb_from_bytes(largest_layer_params * 4)
        # Sharded fp16 parameter shard.
        param_shard_gb = gb_from_bytes((n_params * 2) / world_size)
        # Sharded fp16 gradient shard.
        grad_shard_gb = gb_from_bytes((n_params * 2) / world_size)
        # Sharded FP32 Adam/AdamW states: fp32 copy + momentum + variance.
        optim_shard_gb = gb_from_bytes((n_params * 12) / world_size)

        acts_gb = self._activation_memory_gb()

        subtotal = (
            gathered_layer_gb
            + param_shard_gb
            + grad_shard_gb
            + optim_shard_gb
            + acts_gb
        )

        breakdown = MemoryBreakdown(
            # Parameter memory per GPU = gathered layer + resident shard.
            model_params_gb=gathered_layer_gb + param_shard_gb,
            gradients_gb=grad_shard_gb,
            optimizer_states_gb=optim_shard_gb,
            activations_gb=acts_gb,
            overhead_gb=calculate_overhead(subtotal),
        )
        return self._create_result(breakdown)
|
src/gpu_mem_calculator/engines/megatron.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Megatron-LM engine implementation.
|
| 2 |
+
|
| 3 |
+
Implements memory calculations for Megatron-LM with tensor, pipeline,
|
| 4 |
+
and sequence parallelism.
|
| 5 |
+
|
| 6 |
+
Reference: https://github.com/NVIDIA/Megatron-LM
|
| 7 |
+
Reference: https://arxiv.org/abs/1909.08053
|
| 8 |
+
Reference: https://blog.eleuther.ai/transformer-math/
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from gpu_mem_calculator.core.formulas import (
|
| 12 |
+
calculate_activation_memory,
|
| 13 |
+
calculate_gradient_memory,
|
| 14 |
+
calculate_optimizer_memory,
|
| 15 |
+
calculate_overhead,
|
| 16 |
+
calculate_parameter_memory,
|
| 17 |
+
)
|
| 18 |
+
from gpu_mem_calculator.core.models import (
|
| 19 |
+
MemoryBreakdown,
|
| 20 |
+
MemoryResult,
|
| 21 |
+
)
|
| 22 |
+
from gpu_mem_calculator.engines.base import BaseEngine
|
| 23 |
+
from gpu_mem_calculator.utils.precision import gb_from_bytes
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class MegatronLMEngine(BaseEngine):
    """Megatron-LM memory calculation.

    Megatron-LM uses tensor parallelism to split individual layers across GPUs,
    and optionally pipeline parallelism to split layers across GPUs.
    """

    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for Megatron-LM training.

        Megatron-LM memory characteristics:
        - Parameters are sharded across tensor parallel GPUs
        - Gradients are sharded across tensor parallel GPUs
        - Optimizer states can be sharded or replicated
        - Activations depend on tensor/pipeline/sequence parallelism

        Returns:
            MemoryResult with complete memory breakdown
        """
        tp = self.parallelism_config.tensor_parallel_size
        pp = self.parallelism_config.pipeline_parallel_size
        use_seq_parallel = self.parallelism_config.sequence_parallel

        # Each tensor-parallel rank holds 1/tp of the parameters.
        shard_params = int(self.model_config.num_parameters / tp)
        dtype_name = self.training_config.dtype.value

        # Parameter shard, in the training dtype.
        weights_gb = calculate_parameter_memory(
            num_params=shard_params,
            dtype=dtype_name,
        )

        # Gradient shard, same precision as parameters.
        grads_gb = calculate_gradient_memory(
            num_params=shard_params,
            dtype=dtype_name,
        )

        # Optimizer states are modeled as sharded the same way as parameters
        # (this can vary with configuration in real deployments).
        optim_gb = calculate_optimizer_memory(
            num_params=shard_params,
            optimizer=self.training_config.optimizer.value,
        )

        # Activations are reduced by TP (hidden dim split), PP (only the
        # current stage's layers) and sequence parallelism (sequence split).
        acts_gb = self._calculate_megatron_activations(
            tp_size=tp,
            pp_size=pp,
            seq_parallel=use_seq_parallel,
        )

        subtotal = weights_gb + grads_gb + optim_gb + acts_gb

        return self._create_result(
            MemoryBreakdown(
                model_params_gb=weights_gb,
                gradients_gb=grads_gb,
                optimizer_states_gb=optim_gb,
                activations_gb=acts_gb,
                overhead_gb=calculate_overhead(subtotal),
            )
        )

    def _calculate_megatron_activations(
        self,
        tp_size: int,
        pp_size: int,
        seq_parallel: bool,
    ) -> float:
        """Calculate activation memory for Megatron-LM.

        Megatron-LM activations are affected by parallelism strategy:
        - Tensor parallelism: splits hidden dimension
        - Pipeline parallelism: only current stage's activations
        - Sequence parallelism: splits sequence dimension

        Args:
            tp_size: Tensor parallelism size
            pp_size: Pipeline parallelism size
            seq_parallel: Whether sequence parallelism is enabled

        Returns:
            Activation memory in GB
        """
        full_activations = calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=tp_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

        # Each pipeline stage holds num_layers / pp_size layers' activations.
        stage_fraction = 1.0 / pp_size

        # Sequence parallelism further splits the sequence dimension across
        # the TP group (only meaningful when TP is actually in use).
        sequence_fraction = 1.0 / tp_size if (seq_parallel and tp_size > 1) else 1.0

        return full_activations * stage_fraction * sequence_fraction
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class MegatronDeepSpeedEngine(BaseEngine):
    """Megatron-LM + DeepSpeed combined engine.

    This combines Megatron-LM's tensor/pipeline parallelism with
    DeepSpeed ZeRO's optimizer/gradient sharding.
    """

    def calculate_memory(self) -> MemoryResult:
        """Calculate memory for Megatron-LM + DeepSpeed.

        This uses:
        - Megatron-LM for tensor/pipeline parallelism and activation memory
        - DeepSpeed ZeRO for optimizer/gradient sharding across the
          data-parallel group

        Returns:
            MemoryResult with complete memory breakdown
        """
        tp_size = self.parallelism_config.tensor_parallel_size
        pp_size = self.parallelism_config.pipeline_parallel_size
        seq_parallel = self.parallelism_config.sequence_parallel
        dp_size = self.parallelism_config.data_parallel_size

        # Activation memory follows the Megatron-LM accounting (TP/PP/SP).
        activations_gb = self._calculate_megatron_activations(
            tp_size=tp_size,
            pp_size=pp_size,
            seq_parallel=seq_parallel,
        )

        # Parameters are already split across the tensor-parallel group;
        # ZeRO then shards the remainder over the data-parallel group.
        params_per_gpu = self.model_config.num_parameters / tp_size

        zero_stage = self.engine_config.zero_stage or 2
        offload_optimizer = self.engine_config.offload_optimizer

        # Model parameters (2 bytes/param fp16/bf16).
        if zero_stage >= 3:
            # ZeRO-3 additionally shards parameters over the DP group.
            model_params_gb = gb_from_bytes((params_per_gpu * 2) / dp_size)
        else:
            # ZeRO-0/1/2 keeps a full (TP-sharded) parameter copy per GPU.
            model_params_gb = gb_from_bytes(params_per_gpu * 2)

        # Gradients (2 bytes/param); ZeRO-2+ shards them over the DP group.
        if zero_stage >= 2:
            gradients_gb = gb_from_bytes((params_per_gpu * 2) / dp_size)
        else:
            gradients_gb = gb_from_bytes(params_per_gpu * 2)

        # Optimizer states: 12 bytes/param for Adam/AdamW in FP32
        # (fp32 parameter copy + momentum + variance).
        if offload_optimizer.value == "cpu":
            # Offloaded to host memory; no GPU-side cost is tracked here.
            optimizer_gb = 0.0
        elif zero_stage >= 1:
            # ZeRO-1+ shards optimizer states over the DP group.
            optimizer_gb = gb_from_bytes((params_per_gpu * 12) / dp_size)
        else:
            optimizer_gb = gb_from_bytes(params_per_gpu * 12)

        # Overhead: base_memory is already in GB, so apply the shared
        # overhead estimate directly, consistent with every other engine.
        # (Previously the GB figure was passed through gb_from_bytes(),
        # which treated it as a byte count and collapsed overhead to ~0.)
        base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
        overhead_gb = calculate_overhead(base_memory)

        breakdown = MemoryBreakdown(
            model_params_gb=model_params_gb,
            gradients_gb=gradients_gb,
            optimizer_states_gb=optimizer_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown)

    def _calculate_megatron_activations(
        self,
        tp_size: int,
        pp_size: int,
        seq_parallel: bool,
    ) -> float:
        """Calculate activation memory for Megatron-LM.

        Args:
            tp_size: Tensor parallelism size
            pp_size: Pipeline parallelism size
            seq_parallel: Whether sequence parallelism is enabled

        Returns:
            Activation memory in GB
        """
        # Base activation memory (already divided by TP inside the formula).
        base_activations = calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=tp_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

        # Each pipeline stage only holds its own layers' activations.
        pp_factor = 1.0 / pp_size

        # Sequence parallelism splits the sequence dimension across TP ranks.
        if seq_parallel and tp_size > 1:
            seq_factor = 1.0 / tp_size
        else:
            seq_factor = 1.0

        return base_activations * pp_factor * seq_factor
|
src/gpu_mem_calculator/engines/pytorch.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PyTorch DDP (Distributed Data Parallel) engine implementation.
|
| 2 |
+
|
| 3 |
+
This is the baseline implementation without any memory optimizations.
|
| 4 |
+
|
| 5 |
+
Reference: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
|
| 6 |
+
Reference: https://blog.eleuther.ai/transformer-math/
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from gpu_mem_calculator.core.formulas import (
|
| 10 |
+
calculate_activation_memory,
|
| 11 |
+
calculate_gradient_memory,
|
| 12 |
+
calculate_optimizer_memory,
|
| 13 |
+
calculate_overhead,
|
| 14 |
+
calculate_parameter_memory,
|
| 15 |
+
)
|
| 16 |
+
from gpu_mem_calculator.core.models import (
|
| 17 |
+
MemoryBreakdown,
|
| 18 |
+
MemoryResult,
|
| 19 |
+
)
|
| 20 |
+
from gpu_mem_calculator.engines.base import BaseEngine
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class PyTorchDDPEngine(BaseEngine):
    """PyTorch DDP memory calculation.

    DDP replicates the model on each GPU, so memory is not sharded.
    Each GPU holds a full copy of the model, gradients, and optimizer states.
    """

    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for PyTorch DDP training.

        For DDP:
        - Model parameters: Full model on each GPU
        - Gradients: Full gradients on each GPU
        - Optimizer states: Full optimizer states on each GPU (FP32)
        - Activations: Batch size dependent, split by data parallel

        Returns:
            MemoryResult with complete memory breakdown
        """
        model_cfg = self.model_config
        train_cfg = self.training_config
        n_params = model_cfg.num_parameters
        dtype_name = train_cfg.dtype.value

        # Full weight copy in the training dtype.
        weights_gb = calculate_parameter_memory(
            num_params=n_params,
            dtype=dtype_name,
        )

        # Full gradient buffer, same precision as the weights.
        grads_gb = calculate_gradient_memory(
            num_params=n_params,
            dtype=dtype_name,
        )

        # Full optimizer states (FP32 for Adam/AdamW), not sharded.
        optim_gb = calculate_optimizer_memory(
            num_params=n_params,
            optimizer=train_cfg.optimizer.value,
        )

        # Activations, driven by batch size and model architecture.
        acts_gb = calculate_activation_memory(
            batch_size=train_cfg.batch_size,
            seq_len=model_cfg.max_seq_len,
            hidden_size=model_cfg.hidden_size,
            num_layers=model_cfg.num_layers,
            num_attention_heads=model_cfg.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=train_cfg.activation_checkpointing,
            moe_enabled=model_cfg.moe_enabled,
            num_experts=model_cfg.num_experts,
            top_k=model_cfg.top_k,
            expert_intermediate_size=model_cfg.expert_intermediate_size,
        )

        # Fixed-fraction overhead on top of all tracked components.
        subtotal = weights_gb + grads_gb + optim_gb + acts_gb
        overhead_gb = calculate_overhead(subtotal)

        return self._create_result(
            MemoryBreakdown(
                model_params_gb=weights_gb,
                gradients_gb=grads_gb,
                optimizer_states_gb=optim_gb,
                activations_gb=acts_gb,
                overhead_gb=overhead_gb,
            )
        )
|
src/gpu_mem_calculator/exporters/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Framework configuration exporters."""
|
| 2 |
+
|
| 3 |
+
from gpu_mem_calculator.exporters.accelerate import AccelerateExporter
|
| 4 |
+
from gpu_mem_calculator.exporters.axolotl import AxolotlExporter
|
| 5 |
+
from gpu_mem_calculator.exporters.lightning import LightningExporter
|
| 6 |
+
from gpu_mem_calculator.exporters.manager import ExportFormat, ExportManager
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"ExportManager",
|
| 10 |
+
"ExportFormat",
|
| 11 |
+
"AccelerateExporter",
|
| 12 |
+
"LightningExporter",
|
| 13 |
+
"AxolotlExporter",
|
| 14 |
+
]
|
src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (628 Bytes). View file
|
|
|
src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc
ADDED
|
Binary file (7.81 kB). View file
|
|
|
src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc
ADDED
|
Binary file (9.07 kB). View file
|
|
|
src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc
ADDED
|
Binary file (9.41 kB). View file
|
|
|