George Yang committed on
Commit
36ed1cd
·
1 Parent(s): 8e7e10d

Initial deployment: Add GPU Memory Calculator with Docker

Browse files

- Add FastAPI web application
- Add all calculator modules (training, inference, multi-node)
- Configure Docker for Python 3.12
- Add requirements.txt with web dependencies
- Add Space README with metadata

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .dockerignore +92 -0
  2. Dockerfile +40 -0
  3. README.md +57 -7
  4. cli/main.py +399 -0
  5. requirements.txt +12 -0
  6. src/gpu_mem_calculator.egg-info/PKG-INFO +720 -0
  7. src/gpu_mem_calculator.egg-info/SOURCES.txt +46 -0
  8. src/gpu_mem_calculator.egg-info/dependency_links.txt +1 -0
  9. src/gpu_mem_calculator.egg-info/entry_points.txt +2 -0
  10. src/gpu_mem_calculator.egg-info/requires.txt +16 -0
  11. src/gpu_mem_calculator.egg-info/top_level.txt +1 -0
  12. src/gpu_mem_calculator/__init__.py +3 -0
  13. src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc +0 -0
  14. src/gpu_mem_calculator/cli/__init__.py +5 -0
  15. src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc +0 -0
  16. src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc +0 -0
  17. src/gpu_mem_calculator/cli/main.py +399 -0
  18. src/gpu_mem_calculator/config/__init__.py +5 -0
  19. src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc +0 -0
  20. src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc +0 -0
  21. src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc +0 -0
  22. src/gpu_mem_calculator/config/parser.py +323 -0
  23. src/gpu_mem_calculator/config/presets.py +83 -0
  24. src/gpu_mem_calculator/core/__init__.py +24 -0
  25. src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc +0 -0
  26. src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc +0 -0
  27. src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc +0 -0
  28. src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc +0 -0
  29. src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc +0 -0
  30. src/gpu_mem_calculator/core/calculator.py +178 -0
  31. src/gpu_mem_calculator/core/formulas.py +268 -0
  32. src/gpu_mem_calculator/core/models.py +568 -0
  33. src/gpu_mem_calculator/core/multinode.py +308 -0
  34. src/gpu_mem_calculator/engines/__init__.py +16 -0
  35. src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc +0 -0
  36. src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc +0 -0
  37. src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc +0 -0
  38. src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc +0 -0
  39. src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc +0 -0
  40. src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc +0 -0
  41. src/gpu_mem_calculator/engines/base.py +220 -0
  42. src/gpu_mem_calculator/engines/deepspeed.py +316 -0
  43. src/gpu_mem_calculator/engines/fsdp.py +213 -0
  44. src/gpu_mem_calculator/engines/megatron.py +257 -0
  45. src/gpu_mem_calculator/engines/pytorch.py +88 -0
  46. src/gpu_mem_calculator/exporters/__init__.py +14 -0
  47. src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc +0 -0
  48. src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc +0 -0
  49. src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc +0 -0
  50. src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc +0 -0
.dockerignore ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git
2
+ .git
3
+ .gitignore
4
+ .github
5
+
6
+ # Docker
7
+ Dockerfile
8
+ .dockerignore
9
+
10
+ # Python
11
+ __pycache__
12
+ *.py[cod]
13
+ *$py.class
14
+ *.so
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+
31
+ # Virtual environments
32
+ venv/
33
+ env/
34
+ ENV/
35
+ .venv/
36
+ .env
37
+
38
+ # Testing
39
+ .pytest_cache/
40
+ .coverage
41
+ coverage.xml
42
+ htmlcov/
43
+ .tox/
44
+ .mypy_cache/
45
+ .ruff_cache/
46
+
47
+ # IDEs
48
+ .vscode/
49
+ .idea/
50
+ *.swp
51
+ *.swo
52
+ *~
53
+
54
+ # OS
55
+ .DS_Store
56
+ Thumbs.db
57
+
58
+ # Claude
59
+ .claude/
60
+ .mcp.json
61
+
62
+ # Documentation (source files included, but skip extras)
63
+ docs/
64
+ *.md
65
+ !README.md
66
+
67
+ # Project specific
68
+ *.log
69
+ .env
70
+ .venv/
71
+
72
+ # CI/CD
73
+ CODE_OF_CONDUCT.md
74
+ CONTRIBUTING.md
75
+ MARKETING.md
76
+ SECURITY.md
77
+ CHANGELOG.md
78
+
79
+ # Screenshots and images
80
+ *.png
81
+ *.jpg
82
+ *.jpeg
83
+ *.gif
84
+ !screenshot.png
85
+
86
+ # Test files
87
+ tests/
88
+ examples/
89
+ configs/
90
+
91
+ # MCP server config
92
+ .mcp.json
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile for Hugging Face Spaces
2
+ # GPU Memory Calculator - FastAPI Web Application
3
+
4
+ FROM python:3.12-slim
5
+
6
+ # Set working directory
7
+ WORKDIR /app
8
+
9
+ # Set environment variables
10
+ ENV PYTHONUNBUFFERED=1 \
11
+ PYTHONDONTWRITEBYTECODE=1 \
12
+ PORT=7860
13
+
14
+ # Install system dependencies
15
+ RUN apt-get update && \
16
+ apt-get install -y --no-install-recommends \
17
+ gcc \
18
+ && rm -rf /var/lib/apt/lists/*
19
+
20
+ # Copy requirements first for better Docker layer caching
21
+ COPY requirements.txt .
22
+
23
+ # Install Python dependencies
24
+ RUN pip install --no-cache-dir -r requirements.txt
25
+
26
+ # Copy project files
27
+ COPY . .
28
+
29
+ # Install the package in editable mode
30
+ RUN pip install --no-cache-dir -e .
31
+
32
+ # Expose Hugging Face Spaces default port
33
+ EXPOSE 7860
34
+
35
+ # Health check endpoint
36
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
37
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/').read()"
38
+
39
+ # Run the FastAPI application with uvicorn
40
+ CMD ["uvicorn", "web.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,62 @@
1
  ---
2
- title: Gpu Memory Calculator
3
- emoji: 😻
4
- colorFrom: gray
5
- colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
- short_description: Calculates GPU memory for training, inference, and more
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: GPU Memory Calculator
3
+ emoji: 🎮
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
+ license: mit
 
9
  ---
10
 
11
+ # GPU Memory Calculator
12
+
13
+ Calculate GPU memory requirements for training and running Large Language Models (LLMs). Supports multiple training engines (PyTorch DDP, DeepSpeed ZeRO, Megatron-LM, FSDP), inference engines (HuggingFace, vLLM, TGI, TensorRT-LLM, SGLang), and multi-node training configurations.
14
+
15
+ ## Features
16
+
17
+ - **Training Memory Calculation**: Calculate memory for PyTorch DDP, DeepSpeed ZeRO (0-3), Megatron-LM, FSDP, and hybrid approaches
18
+ - **Inference Memory Calculation**: Estimate memory requirements for HuggingFace Transformers, vLLM, TGI, TensorRT-LLM, and SGLang
19
+ - **Multi-Node Support**: Calculate network overhead for distributed training across multiple nodes
20
+ - **Model Presets**: Pre-configured settings for popular models (LLaMA 2, GPT-3, Mixtral, GLM, Qwen, DeepSeek-MoE)
21
+ - **Configuration Export**: Generate configs for Accelerate, Lightning, Axolotl, DeepSpeed, YAML, and JSON
22
+ - **Batch Size Optimization**: Automatically find the maximum batch size that fits in GPU memory
23
+
24
+ ## Supported Training Engines
25
+
26
+ - PyTorch DDP (Distributed Data Parallel)
27
+ - DeepSpeed ZeRO (Stages 0-3) with CPU/NVMe offloading
28
+ - Megatron-LM (Tensor + Pipeline Parallelism)
29
+ - PyTorch FSDP (Fully Sharded Data Parallel)
30
+ - Megatron-LM + DeepSpeed (Hybrid)
31
+
32
+ ## Supported Inference Engines
33
+
34
+ - HuggingFace Transformers
35
+ - vLLM (PagedAttention)
36
+ - Text Generation Inference (TGI)
37
+ - TensorRT-LLM
38
+ - SGLang (RadixAttention)
39
+
40
+ ## How to Use
41
+
42
+ 1. **Select a preset model** or configure your own
43
+ 2. **Choose training/inference engine** and adjust parameters
44
+ 3. **Calculate** memory requirements instantly
45
+ 4. **Export** configurations to your preferred framework
46
+
47
+ ## Example Use Cases
48
+
49
+ - Planning GPU requirements for LLM training
50
+ - Optimizing batch sizes for your hardware
51
+ - Comparing memory efficiency across engines
52
+ - Estimating KV cache memory for inference
53
+ - Calculating multi-node network overhead
54
+
55
+ ## Links
56
+
57
+ - [GitHub Repository](https://github.com/George614/gpu-mem-calculator)
58
+ - [Documentation](https://github.com/George614/gpu-mem-calculator/blob/main/README.md)
59
+
60
+ ## License
61
+
62
+ MIT License - see [LICENSE](https://github.com/George614/gpu-mem-calculator/blob/main/LICENSE) for details.
cli/main.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI interface for GPU Memory Calculator."""
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Literal
7
+
8
+ import click
9
+
10
+ if TYPE_CHECKING:
11
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
12
+ from gpu_mem_calculator.core.models import MemoryResult
13
+
14
+
15
@click.group()
@click.version_option(version="0.1.0")
def main() -> None:
    """GPU Memory Calculator for LLM Training.

    Calculate GPU memory requirements for training Large Language Models
    with various training engines (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP).
    """
    # Command group entry point: subcommands are registered via @main.command().
    # (Docstring doubles as the CLI help text, so it is kept verbatim.)
24
+
25
+
26
@main.command()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True),
    help="Path to JSON configuration file",
)
@click.option(
    "--preset",
    "-p",
    type=str,
    help="Name of a preset model configuration",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output file path (default: stdout)",
)
@click.option(
    "--format",
    "-f",
    type=click.Choice(["json", "yaml", "table"]),
    default="table",
    help="Output format (default: table)",
)
def calculate(
    config: str | None,
    preset: str | None,
    output: str | None,
    format: Literal["json", "yaml", "table"],
) -> None:
    """Calculate GPU memory requirements from config file or preset.

    Examples:
        gpu-mem-calc calculate --config configs/llama2_7b.json
        gpu-mem-calc calculate --preset llama2-7b
        gpu-mem-calc calculate -p mixtral-8x7b --format json
    """
    # Exactly one of --config / --preset must be given.
    if not config and not preset:
        click.echo("Error: Either --config or --preset is required", err=True)
        sys.exit(1)

    if config and preset:
        click.echo("Error: Cannot use both --config and --preset", err=True)
        sys.exit(1)

    try:
        import tempfile

        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator

        if preset:
            # Load preset configuration.
            from gpu_mem_calculator.config.presets import get_preset_config

            preset_config = get_preset_config(preset)
            if preset_config is None:
                click.echo(
                    f"Error: Preset '{preset}' not found. "
                    "Use 'gpu-mem-calc presets' to list available presets.",
                    err=True,
                )
                sys.exit(1)

            # Round-trip the preset through a temp file so we can reuse the
            # single from_config_file() code path.
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
                json.dump(preset_config, f, indent=2)
                temp_path = f.name

            try:
                calculator = GPUMemoryCalculator.from_config_file(temp_path)
            finally:
                # BUG FIX: previously the temp file leaked whenever
                # from_config_file() raised before the unlink ran.
                Path(temp_path).unlink(missing_ok=True)
        else:
            # The guards above guarantee config is non-None here.
            calculator = GPUMemoryCalculator.from_config_file(config)

        result = calculator.calculate()

        # Render the result in the requested format.
        if format == "json":
            output_text = json.dumps(result.model_dump(mode="json"), indent=2)
        elif format == "yaml":
            try:
                import yaml  # type: ignore[import-untyped]

                output_text = yaml.dump(result.model_dump(mode="json"), default_flow_style=False)
            except ImportError:
                click.echo(
                    "Error: YAML format requires PyYAML. Install with: pip install pyyaml",
                    err=True,
                )
                sys.exit(1)
        else:  # table
            output_text = _format_result_as_table(result, calculator)

        # Write to the requested destination (file or stdout).
        if output:
            Path(output).write_text(output_text)
            click.echo(f"Results written to {output}")
        else:
            click.echo(output_text)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
134
+
135
+
136
@main.command()
@click.argument("params", type=float, required=True)
@click.option("--gpus", "-g", type=int, default=1, help="Number of GPUs (default: 1)")
@click.option("--gpu-mem", "-m", type=float, default=80.0, help="GPU memory in GB (default: 80.0)")
@click.option(
    "--engine",
    "-e",
    type=click.Choice(["pytorch", "deepspeed", "megatron", "fsdp"]),
    default="pytorch",
    help="Training engine (default: pytorch)",
)
@click.option(
    "--dtype",
    "-d",
    type=click.Choice(["fp32", "fp16", "bf16"]),
    default="bf16",
    help="Data type (default: bf16)",
)
def quick(
    params: float,
    gpus: int,
    gpu_mem: float,
    engine: str,
    dtype: str,
) -> None:
    """Quick calculation from model size (in billions of parameters).

    Example:
        gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
    """
    try:
        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
        from gpu_mem_calculator.core.models import (
            DType,
            EngineConfig,
            EngineType,
            GPUConfig,
            ModelConfig,
            ParallelismConfig,
            TrainingConfig,
        )

        # CLI string -> enum lookup tables.
        engine_lookup = {
            "pytorch": EngineType.PYTORCH_DDP,
            "deepspeed": EngineType.DEEPSPEED,
            "megatron": EngineType.MEGATRON_LM,
            "fsdp": EngineType.FSDP,
        }
        dtype_lookup = {
            "fp32": DType.FP32,
            "fp16": DType.FP16,
            "bf16": DType.BF16,
        }

        # Rough transformer architecture guesses, keyed by an inclusive upper
        # bound on the parameter count in billions:
        # (limit, hidden_size, num_layers).
        size_table = (
            (1, 768, 12),
            (7, 4096, 32),
            (13, 5120, 40),
            (30, 6656, 60),
            (65, 8192, 80),
        )
        for limit, hidden_size, num_layers in size_table:
            if params <= limit:
                break
        else:
            # Anything larger than 65B falls through to a GPT-3-class shape.
            hidden_size, num_layers = 12288, 96

        calculator = GPUMemoryCalculator(
            model_config=ModelConfig(
                name="quick-estimate",
                num_parameters=int(params * 1e9),
                num_layers=num_layers,
                hidden_size=hidden_size,
                # Assumes 128-dim attention heads — typical for these shapes.
                num_attention_heads=hidden_size // 128,
                vocab_size=32000,
                max_seq_len=2048,
            ),
            training_config=TrainingConfig(
                batch_size=1,
                gradient_accumulation_steps=1,
                dtype=dtype_lookup[dtype],
            ),
            parallelism_config=ParallelismConfig(data_parallel_size=gpus),
            engine_config=EngineConfig(
                type=engine_lookup[engine],
                zero_stage=2 if engine == "deepspeed" else None,
            ),
            gpu_config=GPUConfig(num_gpus=gpus, gpu_memory_gb=gpu_mem),
        )

        click.echo(_format_result_as_table(calculator.calculate(), calculator))

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
270
+
271
+
272
@main.command()
@click.argument("config_path", type=click.Path(exists=True))
def validate(config_path: str) -> None:
    """Validate a configuration file.

    Example:
        gpu-mem-calc validate configs/my_config.json
    """
    try:
        from gpu_mem_calculator.config import ConfigParser

        # Parsing performs the full validation; the parsed result is discarded.
        ConfigParser.parse_full_config(config_path)
        click.echo(f"✓ Configuration file '{config_path}' is valid")

    except Exception as e:
        click.echo(f"✗ Validation failed: {e}", err=True)
        sys.exit(1)
292
+
293
+
294
@main.command()
@click.option(
    "--format",
    "-f",
    type=click.Choice(["list", "json", "table"]),
    default="list",
    help="Output format (default: list)",
)
def presets(format: str) -> None:
    """List available model preset configurations.

    Examples:
        gpu-mem-calc presets
        gpu-mem-calc presets --format table
        gpu-mem-calc presets -f json
    """
    try:
        from gpu_mem_calculator.config.presets import list_presets

        catalog = list_presets()
        if not catalog:
            click.echo("No presets found.")
            return

        if format == "json":
            click.echo(json.dumps(catalog, indent=2))
            return

        entries = sorted(catalog.items())

        if format == "table":
            from rich.console import Console
            from rich.table import Table

            table = Table(
                title="Available Model Presets",
                show_header=True,
                header_style="bold magenta",
            )
            table.add_column("Preset Name", style="cyan", width=25)
            table.add_column("Display Name", style="green", width=30)
            table.add_column("Description", style="yellow")
            for name, info in entries:
                table.add_row(name, info["display_name"], info["description"])
            Console().print(table)
            return

        # Default: plain text listing, one preset per paragraph.
        click.echo("Available model presets:\n")
        for name, info in entries:
            click.echo(f" {name:25} - {info['display_name']}")
            if info.get("description"):
                click.echo(f"{'':27}{info['description']}")
            click.echo()

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
350
+
351
+
352
def _format_result_as_table(result: "MemoryResult", calculator: "GPUMemoryCalculator") -> str:
    """Render a MemoryResult as a rich ASCII table and return it as a string.

    Args:
        result: Calculation output to display.
        calculator: Calculator that produced the result. Kept for interface
            compatibility; not read in this function.

    Returns:
        The rendered table, captured as plain text.
    """
    # BUG FIX: ``MemoryResult`` is imported only under TYPE_CHECKING and the
    # module has no ``from __future__ import annotations``, so the annotation
    # must be a string literal to avoid a NameError at import time.
    from io import StringIO

    from rich.console import Console
    from rich.table import Table

    # Build the Console around the capture buffer up front instead of
    # mutating ``console.file`` after construction.
    buffer = StringIO()
    console = Console(file=buffer)

    # Main results table.
    table = Table(
        title="GPU Memory Calculation Results",
        show_header=True,
        header_style="bold magenta",
    )
    table.add_column("Metric", style="cyan", width=30)
    table.add_column("Value", style="green")

    # Memory totals.
    table.add_row("Memory per GPU", f"{result.total_memory_per_gpu_gb:.2f} GB")
    table.add_row("Total GPU Memory", f"{result.total_memory_all_gpus_gb:.2f} GB")
    table.add_row("CPU Memory", f"{result.cpu_memory_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Per-component breakdown.
    table.add_row("Model Parameters", f"{result.breakdown.model_params_gb:.2f} GB")
    table.add_row("Gradients", f"{result.breakdown.gradients_gb:.2f} GB")
    table.add_row("Optimizer States", f"{result.breakdown.optimizer_states_gb:.2f} GB")
    table.add_row("Activations", f"{result.breakdown.activations_gb:.2f} GB")
    table.add_row("Overhead", f"{result.breakdown.overhead_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Feasibility summary.
    status = "✓ Fits" if result.fits_on_gpu else "✗ OOM"
    table.add_row("Status", status)
    table.add_row("Memory Utilization", f"{result.memory_utilization_percent:.1f}%")
    if result.recommended_batch_size:
        table.add_row("Recommended Batch Size", str(result.recommended_batch_size))

    console.print(table)
    return buffer.getvalue()
396
+
397
+
398
+ if __name__ == "__main__":
399
+ main()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPU Memory Calculator - Requirements for Hugging Face Spaces
2
+
3
+ # Core dependencies
4
+ pydantic>=2.0.0
5
+ click>=8.1.0
6
+ pydantic-settings>=2.0.0
7
+ rich>=13.0.0
8
+
9
+ # Web dependencies
10
+ fastapi>=0.100.0
11
+ uvicorn[standard]>=0.23.0
12
+ jinja2>=3.1.0
src/gpu_mem_calculator.egg-info/PKG-INFO ADDED
@@ -0,0 +1,720 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: gpu-mem-calculator
3
+ Version: 0.1.0
4
+ Summary: GPU Memory Calculator for LLM Training
5
+ Author: GPU Mem Calculator Team
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/George614/gpu-mem-calculator
8
+ Project-URL: Repository, https://github.com/George614/gpu-mem-calculator
9
+ Project-URL: Issues, https://github.com/George614/gpu-mem-calculator/issues
10
+ Keywords: gpu,memory,calculator,llm,large-language-model,training,deepspeed,megatron,pytorch,fsdp,transformer,machine-learning,deep-learning,distributed-training,zero-optimization
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: click>=8.1.0
25
+ Requires-Dist: pydantic-settings>=2.0.0
26
+ Requires-Dist: rich>=13.0.0
27
+ Provides-Extra: web
28
+ Requires-Dist: fastapi>=0.100.0; extra == "web"
29
+ Requires-Dist: uvicorn[standard]>=0.23.0; extra == "web"
30
+ Requires-Dist: jinja2>=3.1.0; extra == "web"
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
33
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
34
+ Requires-Dist: black>=23.0.0; extra == "dev"
35
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
36
+ Requires-Dist: mypy>=1.5.0; extra == "dev"
37
+ Dynamic: license-file
38
+
39
+ # GPU Memory Calculator for LLM Training
40
+
41
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
42
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
43
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
44
+ [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](CONTRIBUTING.md)
45
+
46
+ A versatile Python application for calculating GPU memory requirements for training Large Language Models with support for multiple training engines including PyTorch DDP, DeepSpeed ZeRO, Megatron-LM, and FSDP.
47
+
48
+ 📖 **[Getting Started Guide](docs/GETTING_STARTED.md)** | 💬 **[FAQ](docs/FAQ.md)** | 🤝 **[Contributing](CONTRIBUTING.md)**
49
+
50
+ <p align="center">
51
+ <img src="screenshot.png" alt="GPU Memory Calculator Screenshot" width="800">
52
+ </p>
53
+
54
+ ## 🚀 Why Use This Tool?
55
+
56
+ Training large language models requires careful memory planning. This calculator helps you:
57
+
58
+ - **💰 Save costs** by determining the optimal GPU configuration before you start training
59
+ - **⚡ Avoid OOM errors** by validating your training configuration fits in GPU memory
60
+ - **📊 Compare strategies** across different training engines (DeepSpeed, Megatron, FSDP)
61
+ - **🎯 Plan infrastructure** by knowing exactly how many GPUs you need
62
+ - **📈 Scale efficiently** with detailed memory breakdowns for optimization
63
+
64
+ Whether you're training a 7B parameter model on a single GPU or a 175B model across hundreds of GPUs, this tool provides accurate memory estimates based on proven formulas from DeepSpeed, Megatron-LM, and the latest research.
65
+
66
+ ## ✨ Features
67
+
68
+ ### Core Training Calculation
69
+ - 🔧 **Multiple Training Engines**: Support for PyTorch DDP, DeepSpeed ZeRO (stages 1-3), Megatron-LM, Megatron+DeepSpeed, and PyTorch FSDP
70
+ - 🖥️ **Dual Interface**: Both CLI and Web UI for flexible usage
71
+ - 🎯 **Preset Models**: Quick-load configurations for popular models (LLaMA 2, GPT-3, etc.)
72
+ - 📊 **Detailed Breakdown**: Memory breakdown by component (parameters, gradients, optimizer states, activations)
73
+ - ✅ **Feasibility Analysis**: Check if your configuration fits on available GPU memory
74
+ - ⚙️ **Easy Config**: JSON-based configuration files with human-readable parameter formats (e.g., "7B", "7000M")
75
+
76
+ ### 🆕 Inference Memory Calculation
77
+ - 🚀 **Multi-Engine Support**: HuggingFace Transformers, vLLM, TGI, TensorRT-LLM
78
+ - 💾 **KV Cache Optimization**: Quantization options (NONE, INT8, FP8, INT4)
79
+ - 🔄 **Tensor Parallelism**: Automatic memory distribution across GPUs
80
+ - 📈 **Throughput Estimation**: Tokens/second estimates for capacity planning
81
+ - 🎯 **Batch Size Optimization**: Find maximum batch size for your hardware
82
+
83
+ ### 🆕 Multi-Node Training
84
+ - 🌐 **Network Overhead Calculation**: AllReduce, AllGather, ReduceScatter, pipeline communication
85
+ - 📡 **Interconnect Support**: InfiniBand, NVLink, Ethernet (10G/25G/100G/200G)
86
+ - ⚡ **Hybrid Parallelism Optimization**: Automatic TP+PP+DP strategy optimization
87
+ - 🔧 **ZeRO Stage Impact Analysis**: Compare communication overhead across ZeRO stages
88
+
89
+ ### 🆕 Framework Configuration Exporters
90
+ - 📦 **Accelerate Export**: HuggingFace Accelerate config generation
91
+ - ⚡ **Lightning Export**: PyTorch Lightning Trainer configuration
92
+ - 🔥 **Axolotl Export**: YAML config for fine-tuning
93
+ - 📄 **File Export**: Save to YAML/JSON formats
94
+ - 🎛️ **Format Conversion**: Convert between different framework configs
95
+
96
+ ## 📦 Installation
97
+
98
+ ### Quick Start
99
+
100
+ ### Core Capabilities
101
+ - **Multiple Training Engines**: Support for PyTorch DDP, DeepSpeed ZeRO (stages 0-3), Megatron-LM, Megatron+DeepSpeed, and PyTorch FSDP
102
+ - **Dual Interface**: Both CLI and Web UI for flexible usage
103
+ - **Preset Models**: Quick-load configurations for popular models (LLaMA 2, GPT-3, GLM, Mixtral, etc.)
104
+ - **Detailed Breakdown**: Memory breakdown by component (parameters, gradients, optimizer states, activations)
105
+ - **Feasibility Analysis**: Check if your configuration fits on available GPU memory
106
+ - **Easy Config**: JSON-based configuration files with human-readable parameter formats (e.g., "7B", "7000M")
107
+
108
+ ### Web UI Enhancements
109
+ - **Formula Explanations**: See exactly how memory is calculated with your values plugged in
110
+ - **Real-time Validation**: Client-side validation prevents invalid configurations
111
+ - **Smart Auto-calculation**: Optimized debouncing (1s) with minimum interval protection
112
+ - **Export Capabilities**: Export to DeepSpeed config files, JSON, or copy to clipboard
113
+ - **Batch Size Optimizer**: Automatically find maximum batch size that fits
114
+ - **Comparison Mode**: Save and compare different configurations side-by-side
115
+ - **Accessibility Features**: ARIA labels, keyboard navigation, colorblind-friendly charts
116
+
117
+ ### Advanced Features
118
+ - **MoE Support**: Mixture of Experts models with configurable experts and top-k routing
119
+ - **CPU/NVMe Offloading**: Offload optimizer states and parameters to CPU or NVMe storage
120
+ - **Activation Checkpointing**: 5 levels from none to full checkpointing
121
+ - **Sequence Parallelism**: Optimize memory for long sequences
122
+ - **Result Caching**: Fast repeated calculations with built-in caching
123
+
124
+ ```bash
125
+ pip install git+https://github.com/George614/gpu-mem-calculator.git
126
+ ```
127
+
128
+ ### From source
129
+
130
+ ```bash
131
+ git clone https://github.com/George614/gpu-mem-calculator.git
132
+ cd gpu-mem-calculator
133
+ pip install -e .
134
+ ```
135
+
136
+ ### For Web UI support
137
+
138
+ ```bash
139
+ pip install -e ".[web]"
140
+ ```
141
+
142
+ ### Development installation
143
+
144
+ ```bash
145
+ pip install -e ".[dev]"
146
+ ```
147
+
148
+ ## 🎓 Use Cases
149
+
150
+ ### Research & Academia
151
+ - Estimate GPU requirements for research projects before requesting compute resources
152
+ - Plan multi-GPU training configurations for large-scale experiments
153
+ - Compare memory efficiency of different training strategies
154
+
155
+ ### Industry & Production
156
+ - Cost optimization: Choose the right GPU type and count for your training workload
157
+ - Capacity planning: Forecast infrastructure needs for model development
158
+ - Debugging: Diagnose OOM errors and optimize memory usage
159
+
160
+ ### Education & Learning
161
+ - Understand how training configuration affects memory consumption
162
+ - Learn about different distributed training strategies
163
+ - Experiment with various optimization techniques safely
164
+
165
+ ## 🚀 Usage
166
+
167
+ ### Command Line Interface
168
+
169
+ #### Using model presets (Recommended)
170
+
171
+ The calculator includes pre-configured model presets for popular LLMs:
172
+
173
+ ```bash
174
+ # List all available presets
175
+ gpu-mem-calc presets
176
+
177
+ # Calculate with a preset
178
+ gpu-mem-calc calculate --preset llama2-7b
179
+ gpu-mem-calc calculate --preset mixtral-8x7b --format json
180
+
181
+ # List presets in table format
182
+ gpu-mem-calc presets --format table
183
+ ```
184
+
185
+ Available presets include:
186
+ - **Dense Models**: LLaMA 2 (7B, 13B, 70B), GPT-3 (175B)
187
+ - **MoE Models**: Mixtral 8x7B, GLM-4 (9B), GLM-4.7 (355B), GLM-4.5 Air (106B),
188
+ Qwen1.5-MoE-A2.7B, DeepSeek-MoE (16B)
189
+
190
+ #### Calculate from config file
191
+
192
+ ```bash
193
+ gpu-mem-calc calculate --config configs/llama2_7b_deepspeed.json
194
+ ```
195
+
196
+ #### Quick calculation from model size
197
+
198
+ ```bash
199
+ # Calculate memory for 7B model with 8x80GB GPUs using DeepSpeed
200
+ gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
201
+
202
+ # With custom GPU memory
203
+ gpu-mem-calc quick 70 --gpus 64 --gpu-mem 80 --engine megatron
204
+ ```
205
+
206
+ #### Validate configuration
207
+
208
+ ```bash
209
+ gpu-mem-calc validate configs/my_config.json
210
+ ```
211
+
212
+ ### Web Interface
213
+
214
+ Start the web server:
215
+
216
+ ```bash
217
+ python -m gpu_mem_calculator.web.app
218
+ ```
219
+
220
+ Or using uvicorn directly:
221
+
222
+ ```bash
223
+ uvicorn gpu_mem_calculator.web.app:app --reload
224
+ ```
225
+
226
+ Then open your browser to `http://localhost:8000`
227
+
228
+ ### Python API
229
+
230
+ #### Training Memory Calculation
231
+
232
+ ```python
233
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
234
+ from gpu_mem_calculator.core.models import (
235
+ ModelConfig,
236
+ TrainingConfig,
237
+ ParallelismConfig,
238
+ EngineConfig,
239
+ GPUConfig,
240
+ )
241
+
242
+ # Create configuration
243
+ model_config = ModelConfig(
244
+ name="llama2-7b",
245
+ num_parameters=7_000_000_000,
246
+ num_layers=32,
247
+ hidden_size=4096,
248
+ num_attention_heads=32,
249
+ vocab_size=32000,
250
+ max_seq_len=4096,
251
+ )
252
+
253
+ training_config = TrainingConfig(
254
+ batch_size=4,
255
+ gradient_accumulation_steps=4,
256
+ dtype="bf16",
257
+ optimizer="adamw",
258
+ )
259
+
260
+ parallelism_config = ParallelismConfig(
261
+ data_parallel_size=8,
262
+ )
263
+
264
+ engine_config = EngineConfig(
265
+ type="deepspeed",
266
+ zero_stage=3,
267
+ offload_optimizer="cpu",
268
+ )
269
+
270
+ gpu_config = GPUConfig(
271
+ num_gpus=8,
272
+ gpu_memory_gb=80,
273
+ )
274
+
275
+ # Calculate memory
276
+ calculator = GPUMemoryCalculator(
277
+ model_config=model_config,
278
+ training_config=training_config,
279
+ parallelism_config=parallelism_config,
280
+ engine_config=engine_config,
281
+ gpu_config=gpu_config,
282
+ )
283
+
284
+ result = calculator.calculate()
285
+
286
+ print(f"Memory per GPU: {result.total_memory_per_gpu_gb:.2f} GB")
287
+ print(f"Fits on GPU: {result.fits_on_gpu}")
288
+ print(f"Utilization: {result.memory_utilization_percent:.1f}%")
289
+ ```
290
+
291
+ #### 🆕 Inference Memory Calculation
292
+
293
+ ```python
294
+ from gpu_mem_calculator.inference.calculator import InferenceMemoryCalculator
295
+ from gpu_mem_calculator.core.models import (
296
+ ModelConfig,
297
+ InferenceConfig,
298
+ InferenceEngineType,
299
+ GPUConfig,
300
+ )
301
+
302
+ # Create configurations
303
+ model_config = ModelConfig(
304
+ name="llama2-7b",
305
+ num_parameters=7_000_000_000,
306
+ num_layers=32,
307
+ hidden_size=4096,
308
+ num_attention_heads=32,
309
+ max_seq_len=4096,
310
+ )
311
+
312
+ inference_config = InferenceConfig(
313
+ batch_size=32,
314
+ kv_cache_quantization="int8", # NONE, INT8, FP8, INT4
315
+ tensor_parallel_size=2,
316
+ gpu_memory_utilization=0.9,
317
+ )
318
+
319
+ gpu_config = GPUConfig(num_gpus=2, gpu_memory_gb=80)
320
+
321
+ # Calculate for different inference engines
322
+ calculator = InferenceMemoryCalculator(model_config, inference_config, gpu_config)
323
+
324
+ # vLLM inference
325
+ result_vllm = calculator.calculate(InferenceEngineType.VLLM)
326
+ print(f"vLLM: {result_vllm.total_memory_per_gpu_gb:.2f} GB")
327
+ print(f"Max batch size: {result_vllm.max_supported_batch_size}")
328
+ print(f"Throughput: {result_vllm.estimated_throughput_tokens_per_sec:.0f} tokens/sec")
329
+
330
+ # TensorRT-LLM inference
331
+ result_trt = calculator.calculate(InferenceEngineType.TENSORRT_LLM)
332
+ print(f"TensorRT-LLM: {result_trt.total_memory_per_gpu_gb:.2f} GB")
333
+ ```
334
+
335
+ #### 🆕 Multi-Node Network Overhead
336
+
337
+ ```python
338
+ from gpu_mem_calculator.core.multinode import MultiNodeCalculator
339
+ from gpu_mem_calculator.core.models import (
340
+ NodeConfig,
341
+ InterconnectType,
342
+ )
343
+
344
+ # Configure multi-node setup
345
+ node_config = NodeConfig(
346
+ num_nodes=4,
347
+ gpus_per_node=8,
348
+ interconnect_type=InterconnectType.INFINIBAND,
349
+ )
350
+
351
+ calculator = MultiNodeCalculator(
352
+ model_config=model_config,
353
+ training_config=training_config,
354
+ parallelism_config=parallelism_config,
355
+ node_config=node_config,
356
+ engine_config=engine_config,
357
+ )
358
+
359
+ # Calculate network overhead
360
+ network_overhead = calculator.calculate_network_overhead()
361
+ print(f"AllReduce: {network_overhead.allreduce_gb:.2f} GB")
362
+ print(f"AllGather: {network_overhead.allgather_gb:.2f} GB")
363
+ print(f"Time overhead: {network_overhead.estimated_overhead_ms_per_step:.2f} ms/step")
364
+
365
+ # Optimize hybrid parallelism
366
+ from gpu_mem_calculator.core.models import HybridParallelismConfig
367
+
368
+ hybrid_config = HybridParallelismConfig(
369
+ auto_optimize=True,
370
+ prefer_pipeline_parallel=True,
371
+ enable_sequence_parallel=True,
372
+ )
373
+
374
+ optimized_parallelism = calculator.optimize_hybrid_parallelism(hybrid_config)
375
+ print(f"Optimized TP: {optimized_parallelism.tensor_parallel_size}")
376
+ print(f"Optimized PP: {optimized_parallelism.pipeline_parallel_size}")
377
+ print(f"Optimized DP: {optimized_parallelism.data_parallel_size}")
378
+ ```
379
+
380
+ #### 🆕 Export Framework Configurations
381
+
382
+ ```python
383
+ from gpu_mem_calculator.exporters.manager import ExportManager, ExportFormat
384
+
385
+ # Create export manager
386
+ manager = ExportManager(
387
+ model_config=model_config,
388
+ training_config=training_config,
389
+ parallelism_config=parallelism_config,
390
+ engine_config=engine_config,
391
+ node_config=node_config,
392
+ )
393
+
394
+ # Export to different formats
395
+ accelerate_config = manager.export(ExportFormat.ACCELERATE)
396
+ lightning_config = manager.export(ExportFormat.LIGHTNING)
397
+ axolotl_config = manager.export(ExportFormat.AXOLOTL)
398
+
399
+ # Export to file
400
+ manager.export_to_file(ExportFormat.ACCELERATE, "accelerate_config.yaml")
401
+ manager.export_to_file(ExportFormat.JSON, "config.json")
402
+
403
+ # Get DeepSpeed config
404
+ deepspeed_config = manager.export(ExportFormat.DEEPSPEED)
405
+ ```
406
+
407
+ ## Configuration File Format
408
+
409
+ ```json
410
+ {
411
+ "model": {
412
+ "name": "llama2-7b",
413
+ "num_parameters": "7B",
414
+ "num_layers": 32,
415
+ "hidden_size": 4096,
416
+ "num_attention_heads": 32,
417
+ "vocab_size": 32000,
418
+ "max_seq_len": 4096
419
+ },
420
+ "training": {
421
+ "batch_size": 4,
422
+ "gradient_accumulation_steps": 4,
423
+ "optimizer": "adamw",
424
+ "dtype": "bf16",
425
+ "activation_checkpointing": 1
426
+ },
427
+ "parallelism": {
428
+ "tensor_parallel_size": 1,
429
+ "pipeline_parallel_size": 1,
430
+ "data_parallel_size": 8,
431
+ "sequence_parallel": false
432
+ },
433
+ "engine": {
434
+ "type": "deepspeed",
435
+ "zero_stage": 3,
436
+ "offload_optimizer": "cpu",
437
+ "offload_param": "none"
438
+ },
439
+ "hardware": {
440
+ "num_gpus": 8,
441
+ "gpu_memory_gb": 80
442
+ }
443
+ }
444
+ ```
445
+
446
+ ## Supported Training Engines
447
+
448
+ ### PyTorch DDP (Baseline)
449
+ Standard Distributed Data Parallel training without memory optimizations.
450
+
451
+ ### DeepSpeed ZeRO
452
+ - **ZeRO-1**: Shard optimizer states
453
+ - **ZeRO-2**: Shard optimizer states + gradients
454
+ - **ZeRO-3**: Shard everything (parameters, gradients, optimizer states)
455
+ - Supports CPU/NVMe offloading
456
+
457
+ ### Megatron-LM
458
+ Tensor and pipeline parallelism with activation checkpointing support.
459
+
460
+ ### Megatron + DeepSpeed
461
+ Combines Megatron-LM's model parallelism with DeepSpeed ZeRO's optimizer sharding.
462
+
463
+ ### PyTorch FSDP
464
+ Fully Sharded Data Parallel with multiple sharding strategies.
465
+
466
+ ## Memory Formulas
467
+
468
+ The calculator uses formulas verified against authoritative sources:
469
+
470
+ ### Base Components
471
+
472
+ **Model Parameters:**
473
+ - FP16/BF16: `num_params × 2 bytes`
474
+ - FP32: `num_params × 4 bytes`
475
+
476
+ **Gradients:**
477
+ - FP16/BF16: `num_params × 2 bytes`
478
+ - FP32: `num_params × 4 bytes`
479
+
480
+ **Optimizer States** (per optimizer type):
481
+ - **Adam/AdamW**: `num_params × 12 bytes`
482
+ - 4 bytes: FP32 parameter copy
483
+ - 4 bytes: Momentum
484
+ - 4 bytes: Variance
485
+ - **AdamW 8-bit**: `num_params × 2 bytes` (quantized)
486
+ - **SGD**: `num_params × 4 bytes` (FP32 only, no momentum)
487
+
488
+ **Activations:**
489
+ - Approximation: `batch_size × seq_len × hidden_size × num_layers × ~16 bytes/token/layer`
490
+ - Varies based on activation checkpointing level
491
+
492
+ ### DeepSpeed ZeRO Stages
493
+
494
+ **ZeRO-0** (Baseline - same as PyTorch DDP):
495
+ ```
496
+ total_per_gpu = 2×params + 2×params + 12×params + activations
497
+ = 16×params + activations
498
+ ```
499
+
500
+ **ZeRO-1** (Shard optimizer states):
501
+ ```
502
+ total_per_gpu = 2×params + 2×params + (12×params)/num_gpus + activations
503
+ ```
504
+
505
+ **ZeRO-2** (Shard optimizer + gradients):
506
+ ```
507
+ total_per_gpu = 2×params + (2×params)/num_gpus + (12×params)/num_gpus + activations
508
+ ```
509
+
510
+ **ZeRO-3** (Shard everything):
511
+ ```
512
+ total_per_gpu = largest_layer_memory + (16×params)/num_gpus + activations
513
+ where largest_layer_memory ≈ 4×(num_params/10)
514
+ ```
515
+
516
+ **CPU/NVMe Offloading:**
517
+ - Optimizer states offloaded to CPU: 0 GB GPU memory
518
+ - Parameters offloaded to CPU/NVMe: Dynamically gathered during compute
519
+
520
+ ### Verification
521
+
522
+ All formulas have been verified against:
523
+ - ✅ 18 comprehensive test scenarios (100% pass rate)
524
+ - ✅ EleutherAI Transformer Math 101
525
+ - ✅ Microsoft Research ZeRO Blog
526
+ - ✅ DeepSpeed Official Documentation
527
+ - ✅ PyTorch FSDP Documentation
528
+
529
+ ### References
530
+
531
+ - [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) - Comprehensive transformer memory breakdown
532
+ - [Microsoft Research ZeRO Blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) - ZeRO optimization techniques
533
+ - [DeepSpeed Memory Documentation](https://deepspeed.readthedocs.io/en/latest/memory.html) - Official DeepSpeed memory formulas
534
+
535
+ ## Example Configurations
536
+
537
+ ### LLaMA 2 7B with DeepSpeed ZeRO-3
538
+ ```bash
539
+ gpu-mem-calc calculate --config configs/llama2_7b_deepspeed.json
540
+ ```
541
+
542
+ ### GPT-3 175B with Megatron-LM
543
+ ```bash
544
+ gpu-mem-calc calculate --config configs/gpt3_175b_megatron.json
545
+ ```
546
+
547
+ ### Custom 1B model with PyTorch DDP
548
+ ```bash
549
+ gpu-mem-calc calculate --config configs/pytorch_ddp_example.json
550
+ ```
551
+
552
+ ## Web UI Features
553
+
554
+ ### Interactive Interface
555
+ - **Real-time Calculations**: Auto-calculates as you adjust parameters (1s debounce)
556
+ - **Client-side Validation**: Instant feedback on configuration errors before API calls
557
+ - **Smart Presets**: Quick-load model configurations (LLaMA 2, GPT-3, GLM, Mixtral, Qwen, DeepSeek)
558
+ - **Visual Breakdown**: Color-coded bar chart with patterns for colorblind accessibility
559
+ - **Feasibility Status**: Clear indicators showing if configuration fits on GPU
560
+
561
+ ### Formula Explanations
562
+ - **Detailed Breakdowns**: See exact formulas used with your values plugged in
563
+ - **Component-by-Component**: Each memory component explained with formula and result
564
+ - **Authoritative References**: Links to EleutherAI, Microsoft Research, DeepSpeed docs
565
+ - **Engine-Specific Details**: Different formulas for PyTorch DDP, DeepSpeed ZeRO, FSDP, Megatron-LM
566
+
567
+ ### Advanced Tools
568
+ - **Export to DeepSpeed**: Generate `deepspeed_config.json` files automatically
569
+ - **Batch Size Optimizer**: Find maximum batch size that fits your GPU memory
570
+ - **Config Persistence**: Save configurations to browser localStorage
571
+ - **Comparison Mode**: Compare different configurations side-by-side
572
+
573
+ ### Accessibility
574
+ - **ARIA Labels**: Full screen reader support throughout the interface
575
+ - **Keyboard Navigation**: All features accessible via keyboard
576
+ - **Colorblind-Friendly**: Patterns and textures supplement colors in charts
577
+ - **High Contrast**: Clear visual indicators with multiple cues
578
+
579
+ ### API Endpoints
580
+ - `POST /api/calculate` - Calculate GPU memory requirements
581
+ - `POST /api/explain-formula` - Get detailed formula explanation
582
+ - `POST /api/export/deepspeed` - Export DeepSpeed config file
583
+ - `POST /api/optimize/batch-size` - Find maximum batch size
584
+ - `GET /api/preset/{preset_name}` - Load model preset
585
+
586
+ ## Development
587
+
588
+ ### Running Tests
589
+
590
+ ```bash
591
+ pytest tests/
592
+ ```
593
+
594
+ ### Test Coverage
595
+
596
+ The calculator includes comprehensive testing:
597
+ - **Unit Tests**: Core calculation logic for each engine type
598
+ - **Integration Tests**: End-to-end configuration validation
599
+ - **Formula Verification**: 18 scenarios verifying formula accuracy
600
+ - **API Tests**: Web API endpoint testing
601
+ - **Accessibility Tests**: Screen reader and keyboard navigation
602
+
603
+ All formulas verified accurate against authoritative sources with 100% test pass rate.
604
+
605
+ ### Code Formatting
606
+
607
+ ```bash
608
+ black src/ cli/ web/
609
+ ruff check src/ cli/ web/
610
+ ```
611
+
612
+ ### Type Checking
613
+
614
+ ```bash
615
+ mypy src/
616
+ ```
617
+
618
+ ## Recent Improvements
619
+
620
+ ### Latest Updates
621
+ - ✨ Added formula explanation feature with detailed breakdowns
622
+ - ✨ Added client-side validation for better UX
623
+ - ✨ Added batch size optimizer API
624
+ - ✨ Added DeepSpeed config export functionality
625
+ - ✨ Added comprehensive input validation
626
+ - ✨ Added result caching for performance
627
+ - ♿ Added ARIA labels for full accessibility
628
+ - ♿ Added colorblind patterns to charts
629
+ - 🐛 Fixed optimizer formulas to be optimizer-specific
630
+ - 🐛 Fixed Pydantic namespace warnings
631
+
632
+ ### Verification Status
633
+ - ✅ All 18 test scenarios passing (100%)
634
+ - ✅ Formulas verified against EleutherAI, Microsoft Research, DeepSpeed docs
635
+ - ✅ Optimizer formulas corrected for AdamW, AdamW 8-bit, and SGD
636
+ - ✅ ZeRO stage formulas validated (0, 1, 2, 3)
637
+ - ✅ Engine type formulas validated (PyTorch DDP, DeepSpeed, FSDP, Megatron-LM)
638
+
639
+ ## Contributing
640
+
641
+ Contributions are welcome! Please feel free to submit a Pull Request. See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
642
+
643
+ ## 📚 References
644
+
645
+ The memory calculations in this tool are based on authoritative sources:
646
+
647
+ **Core Memory Formulas:**
648
+ - [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) - Comprehensive breakdown of transformer memory requirements
649
+ - [Microsoft Research ZeRO Blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) - ZeRO optimization techniques
650
- [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/abs/2205.05198) - Activation checkpointing strategies
651
+
652
+ **Engine Documentation:**
653
+ - [DeepSpeed Memory Documentation](https://deepspeed.readthedocs.io/en/latest/memory.html) - Official DeepSpeed memory formulas
654
+ - [NVIDIA Megatron-LM](https://github.com/NVIDIA/Megatron-LM) - Tensor and pipeline parallelism
655
+ - [PyTorch FSDP Documentation](https://pytorch.org/docs/stable/fsdp.html) - Fully sharded data parallel
656
+ - [PyTorch DDP Tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) - Distributed data parallel
657
+
658
+ **Related Tools:**
659
+ - [llm-analysis](https://github.com/cli99/llm-analysis) - LLM memory analysis
660
+ - [vram-calculator](https://github.com/furiousteabag/vram-calculator) - VRAM calculation utilities
661
+
662
+ ## 🤝 Community & Support
663
+
664
+ - 📖 [Documentation](README.md)
665
+ - 🐛 [Issue Tracker](https://github.com/George614/gpu-mem-calculator/issues)
666
+ - 💬 [Discussions](https://github.com/George614/gpu-mem-calculator/discussions)
667
+ - 📧 Contact the maintainers via GitHub
668
+
669
+ ### Star History
670
+
671
+ If you find this tool useful, please consider giving it a star! ⭐
672
+
673
+ ## 📋 Roadmap
674
+
675
+ - [x] Inference memory calculation
676
+ - [x] Multi-node training configurations
677
+ - [x] Export to training framework configs (Accelerate, Lightning, Axolotl)
678
+ - [ ] PyPI package distribution
679
+ - [ ] Support for more model architectures (Vision Transformers, Diffusion models)
680
+ - [ ] Real-time memory monitoring dashboard
681
+ - [ ] CLI commands for inference and export features
682
+
683
+ ## 🙏 Acknowledgments
684
+
685
+ This tool was inspired by and builds upon the excellent work of:
686
+ - [DeepSpeed Memory Estimator](https://deepspeed.readthedocs.io/en/latest/memory.html) - ZeRO memory optimization formulas
687
+ - [llm-analysis](https://github.com/cli99/llm-analysis) - LLM memory analysis methodology
688
+ - [vram-calculator](https://github.com/furiousteabag/vram-calculator) - VRAM calculation approach
689
+
690
+ Special thanks to the EleutherAI community for their comprehensive [Transformer Math 101](https://blog.eleuther.ai/transformer-math/) guide, which provides detailed formulas for transformer memory calculations.
691
+
692
+ ## 📄 License
693
+
694
+ MIT License - see [LICENSE](LICENSE) for details.
695
+
696
+ ## 📚 Citation
697
+
698
+ If you use this tool in your research, please cite:
699
+
700
+ ```bibtex
701
+ @software{gpu_mem_calculator,
702
+ title = {GPU Memory Calculator for LLM Training},
703
+ author = {GPU Mem Calculator Team},
704
+ year = {2024},
705
+ url = {https://github.com/George614/gpu-mem-calculator}
706
+ }
707
+ ```
708
+
709
+ ---
710
+
711
+ <p align="center">
712
+ Made with ❤️ for the ML community
713
+ </p>
714
+
715
+ <p align="center">
716
+ <a href="https://github.com/George614/gpu-mem-calculator/stargazers">⭐ Star us on GitHub</a> •
717
+ <a href="https://github.com/George614/gpu-mem-calculator/issues">🐛 Report a Bug</a> •
718
+ <a href="https://github.com/George614/gpu-mem-calculator/issues">💡 Request a Feature</a>
719
+ </p>
720
+
src/gpu_mem_calculator.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/gpu_mem_calculator/__init__.py
5
+ src/gpu_mem_calculator/py.typed
6
+ src/gpu_mem_calculator.egg-info/PKG-INFO
7
+ src/gpu_mem_calculator.egg-info/SOURCES.txt
8
+ src/gpu_mem_calculator.egg-info/dependency_links.txt
9
+ src/gpu_mem_calculator.egg-info/entry_points.txt
10
+ src/gpu_mem_calculator.egg-info/requires.txt
11
+ src/gpu_mem_calculator.egg-info/top_level.txt
12
+ src/gpu_mem_calculator/cli/__init__.py
13
+ src/gpu_mem_calculator/cli/main.py
14
+ src/gpu_mem_calculator/config/__init__.py
15
+ src/gpu_mem_calculator/config/parser.py
16
+ src/gpu_mem_calculator/config/presets.py
17
+ src/gpu_mem_calculator/core/__init__.py
18
+ src/gpu_mem_calculator/core/calculator.py
19
+ src/gpu_mem_calculator/core/formulas.py
20
+ src/gpu_mem_calculator/core/models.py
21
+ src/gpu_mem_calculator/core/multinode.py
22
+ src/gpu_mem_calculator/engines/__init__.py
23
+ src/gpu_mem_calculator/engines/base.py
24
+ src/gpu_mem_calculator/engines/deepspeed.py
25
+ src/gpu_mem_calculator/engines/fsdp.py
26
+ src/gpu_mem_calculator/engines/megatron.py
27
+ src/gpu_mem_calculator/engines/pytorch.py
28
+ src/gpu_mem_calculator/exporters/__init__.py
29
+ src/gpu_mem_calculator/exporters/accelerate.py
30
+ src/gpu_mem_calculator/exporters/axolotl.py
31
+ src/gpu_mem_calculator/exporters/lightning.py
32
+ src/gpu_mem_calculator/exporters/manager.py
33
+ src/gpu_mem_calculator/inference/__init__.py
34
+ src/gpu_mem_calculator/inference/base.py
35
+ src/gpu_mem_calculator/inference/calculator.py
36
+ src/gpu_mem_calculator/inference/huggingface.py
37
+ src/gpu_mem_calculator/inference/tensorrt_llm.py
38
+ src/gpu_mem_calculator/inference/tgi.py
39
+ src/gpu_mem_calculator/inference/vllm.py
40
+ src/gpu_mem_calculator/utils/__init__.py
41
+ src/gpu_mem_calculator/utils/precision.py
42
+ tests/test_calculator.py
43
+ tests/test_comprehensive.py
44
+ tests/test_exporters.py
45
+ tests/test_inference.py
46
+ tests/test_multinode.py
src/gpu_mem_calculator.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
src/gpu_mem_calculator.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ gpu-mem-calc = gpu_mem_calculator.cli:main
src/gpu_mem_calculator.egg-info/requires.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pydantic>=2.0.0
2
+ click>=8.1.0
3
+ pydantic-settings>=2.0.0
4
+ rich>=13.0.0
5
+
6
+ [dev]
7
+ pytest>=7.0.0
8
+ pytest-cov>=4.0.0
9
+ black>=23.0.0
10
+ ruff>=0.1.0
11
+ mypy>=1.5.0
12
+
13
+ [web]
14
+ fastapi>=0.100.0
15
+ uvicorn[standard]>=0.23.0
16
+ jinja2>=3.1.0
src/gpu_mem_calculator.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gpu_mem_calculator
src/gpu_mem_calculator/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
"""GPU Memory Calculator for LLM Training."""

# Package version; keep in sync with the CLI's click.version_option
# (cli/main.py uses version="0.1.0").
__version__ = "0.1.0"
src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (257 Bytes). View file
 
src/gpu_mem_calculator/cli/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
"""CLI interface for GPU Memory Calculator."""

# Re-export the click entry point so the console script target
# "gpu_mem_calculator.cli:main" (see entry_points.txt) resolves.
from gpu_mem_calculator.cli.main import main

__all__ = ["main"]
src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (322 Bytes). View file
 
src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc ADDED
Binary file (14.3 kB). View file
 
src/gpu_mem_calculator/cli/main.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI interface for GPU Memory Calculator."""
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Literal
7
+
8
+ import click
9
+
10
+ if TYPE_CHECKING:
11
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
12
+ from gpu_mem_calculator.core.models import MemoryResult
13
+
14
+
15
@click.group()
@click.version_option(version="0.1.0")
def main() -> None:
    """GPU Memory Calculator for LLM Training.

    Calculate GPU memory requirements for training Large Language Models
    with various training engines (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP).
    """
    # Group body intentionally empty: all behavior lives in the subcommands.
    # (The redundant `pass` was dropped — the docstring is a sufficient body.)
24
+
25
+
26
@main.command()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True),
    help="Path to JSON configuration file",
)
@click.option(
    "--preset",
    "-p",
    type=str,
    help="Name of a preset model configuration",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output file path (default: stdout)",
)
@click.option(
    "--format",
    "-f",
    type=click.Choice(["json", "yaml", "table"]),
    default="table",
    help="Output format (default: table)",
)
def calculate(
    config: str | None,
    preset: str | None,
    output: str | None,
    format: Literal["json", "yaml", "table"],
) -> None:
    """Calculate GPU memory requirements from config file or preset.

    Examples:
        gpu-mem-calc calculate --config configs/llama2_7b.json
        gpu-mem-calc calculate --preset llama2-7b
        gpu-mem-calc calculate -p mixtral-8x7b --format json
    """
    # Exactly one of --config / --preset must be given.
    if not config and not preset:
        click.echo("Error: Either --config or --preset is required", err=True)
        sys.exit(1)

    if config and preset:
        click.echo("Error: Cannot use both --config and --preset", err=True)
        sys.exit(1)

    try:
        import tempfile

        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator

        if preset:
            # Load preset configuration
            from gpu_mem_calculator.config.presets import get_preset_config

            preset_config = get_preset_config(preset)
            if preset_config is None:
                click.echo(
                    f"Error: Preset '{preset}' not found. "
                    "Use 'gpu-mem-calc presets' to list available presets.",
                    err=True,
                )
                sys.exit(1)

            # from_config_file only accepts a path, so round-trip the preset
            # through a temp file.  delete=False is needed so the file can be
            # re-opened by name (required on Windows); the finally below
            # guarantees it is removed even if parsing/validation raises —
            # previously a failure here leaked the temp file.
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
                json.dump(preset_config, f, indent=2)
                temp_path = f.name

            try:
                calculator = GPUMemoryCalculator.from_config_file(temp_path)
            finally:
                Path(temp_path).unlink(missing_ok=True)  # Clean up temp file
        elif config:
            calculator = GPUMemoryCalculator.from_config_file(config)
        else:
            # Unreachable (guarded above); kept so type-checkers see all paths.
            click.echo("Error: Either --config or --preset is required", err=True)
            sys.exit(1)

        result = calculator.calculate()

        # Format output
        if format == "json":
            output_text = json.dumps(result.model_dump(mode="json"), indent=2)
        elif format == "yaml":
            try:
                import yaml  # type: ignore[import-untyped]

                output_text = yaml.dump(result.model_dump(mode="json"), default_flow_style=False)
            except ImportError:
                click.echo(
                    "Error: YAML format requires PyYAML. Install with: pip install pyyaml",
                    err=True,
                )
                sys.exit(1)
        else:  # table
            output_text = _format_result_as_table(result, calculator)

        # Write output
        if output:
            Path(output).write_text(output_text)
            click.echo(f"Results written to {output}")
        else:
            click.echo(output_text)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
134
+
135
+
136
@main.command()
@click.argument("params", type=float, required=True)
@click.option("--gpus", "-g", type=int, default=1, help="Number of GPUs (default: 1)")
@click.option("--gpu-mem", "-m", type=float, default=80.0, help="GPU memory in GB (default: 80.0)")
@click.option(
    "--engine",
    "-e",
    type=click.Choice(["pytorch", "deepspeed", "megatron", "fsdp"]),
    default="pytorch",
    help="Training engine (default: pytorch)",
)
@click.option(
    "--dtype",
    "-d",
    type=click.Choice(["fp32", "fp16", "bf16"]),
    default="bf16",
    help="Data type (default: bf16)",
)
def quick(
    params: float,
    gpus: int,
    gpu_mem: float,
    engine: str,
    dtype: str,
) -> None:
    """Quick calculation from model size (in billions of parameters).

    Example:
        gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
    """
    try:
        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
        from gpu_mem_calculator.core.models import (
            DType,
            EngineConfig,
            EngineType,
            GPUConfig,
            ModelConfig,
            ParallelismConfig,
            TrainingConfig,
        )

        # CLI choice strings -> enum members.
        engine_types = {
            "pytorch": EngineType.PYTORCH_DDP,
            "deepspeed": EngineType.DEEPSPEED,
            "megatron": EngineType.MEGATRON_LM,
            "fsdp": EngineType.FSDP,
        }
        precision_types = {
            "fp32": DType.FP32,
            "fp16": DType.FP16,
            "bf16": DType.BF16,
        }

        total_params = int(params * 1e9)

        # Rough architecture guesses: (upper bound in billions of params,
        # hidden size, layer count) for typical transformer shapes.
        shape_ladder = (
            (1, 768, 12),
            (7, 4096, 32),
            (13, 5120, 40),
            (30, 6656, 60),
            (65, 8192, 80),
        )
        hidden_size, num_layers = 12288, 96  # fallback for the largest models
        for upper_bound, width, depth in shape_ladder:
            if params <= upper_bound:
                hidden_size, num_layers = width, depth
                break

        calculator = GPUMemoryCalculator(
            model_config=ModelConfig(
                name="quick-estimate",
                num_parameters=total_params,
                num_layers=num_layers,
                hidden_size=hidden_size,
                num_attention_heads=hidden_size // 128,
                vocab_size=32000,
                max_seq_len=2048,
            ),
            training_config=TrainingConfig(
                batch_size=1,
                gradient_accumulation_steps=1,
                dtype=precision_types[dtype],
            ),
            parallelism_config=ParallelismConfig(data_parallel_size=gpus),
            engine_config=EngineConfig(
                type=engine_types[engine],
                # ZeRO-2 is the default stage when DeepSpeed is requested.
                zero_stage=2 if engine == "deepspeed" else None,
            ),
            gpu_config=GPUConfig(num_gpus=gpus, gpu_memory_gb=gpu_mem),
        )

        result = calculator.calculate()

        # Display results
        click.echo(_format_result_as_table(result, calculator))

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
270
+
271
+
272
@main.command()
@click.argument("config_path", type=click.Path(exists=True))
def validate(config_path: str) -> None:
    """Validate a configuration file.

    Example:
        gpu-mem-calc validate configs/my_config.json
    """
    try:
        from gpu_mem_calculator.config import ConfigParser

        # Parsing the full config runs all schema validation; reaching the
        # echo below means the file is well-formed.
        ConfigParser.parse_full_config(config_path)
        click.echo(f"✓ Configuration file '{config_path}' is valid")

    except Exception as e:
        click.echo(f"✗ Validation failed: {e}", err=True)
        sys.exit(1)
292
+
293
+
294
@main.command()
@click.option(
    "--format",
    "-f",
    type=click.Choice(["list", "json", "table"]),
    default="list",
    help="Output format (default: list)",
)
def presets(format: str) -> None:
    """List available model preset configurations.

    Examples:
        gpu-mem-calc presets
        gpu-mem-calc presets --format table
        gpu-mem-calc presets -f json
    """
    try:
        from gpu_mem_calculator.config.presets import list_presets

        catalog = list_presets()
        if not catalog:
            click.echo("No presets found.")
            return

        if format == "json":
            click.echo(json.dumps(catalog, indent=2))
            return

        if format == "table":
            from rich.console import Console
            from rich.table import Table

            table = Table(
                title="Available Model Presets",
                show_header=True,
                header_style="bold magenta",
            )
            table.add_column("Preset Name", style="cyan", width=25)
            table.add_column("Display Name", style="green", width=30)
            table.add_column("Description", style="yellow")
            for name, info in sorted(catalog.items()):
                table.add_row(name, info["display_name"], info["description"])
            Console().print(table)
            return

        # Default "list" format: plain indented listing with aligned names.
        click.echo("Available model presets:\n")
        for name, info in sorted(catalog.items()):
            click.echo(f"  {name:25} - {info['display_name']}")
            if info.get("description"):
                click.echo(f"{'':27}{info['description']}")
            click.echo()

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
350
+
351
+
352
def _format_result_as_table(result: "MemoryResult", calculator: "GPUMemoryCalculator") -> str:
    """Render a calculation result as a rich-formatted ASCII table string.

    Args:
        result: The memory calculation result to display.
        calculator: The calculator that produced ``result``.  Currently
            unused; kept in the signature so existing callers keep working.

    Returns:
        The rendered table as plain text.

    Note:
        The ``result`` annotation must be a string: ``MemoryResult`` is only
        imported under ``TYPE_CHECKING``, so the original unquoted annotation
        raised NameError when this module was imported.
    """
    from io import StringIO

    from rich.console import Console
    from rich.table import Table

    # Render into an in-memory buffer.  Passing file= at construction (rather
    # than reassigning console.file after Console() was created) makes Rich
    # treat the target as a non-terminal stream from the start, so width and
    # color detection are deterministic instead of inherited from stdout.
    buffer = StringIO()
    console = Console(file=buffer)

    # Main results table
    table = Table(
        title="GPU Memory Calculation Results",
        show_header=True,
        header_style="bold magenta",
    )
    table.add_column("Metric", style="cyan", width=30)
    table.add_column("Value", style="green")

    # Memory results
    table.add_row("Memory per GPU", f"{result.total_memory_per_gpu_gb:.2f} GB")
    table.add_row("Total GPU Memory", f"{result.total_memory_all_gpus_gb:.2f} GB")
    table.add_row("CPU Memory", f"{result.cpu_memory_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Per-component breakdown
    table.add_row("Model Parameters", f"{result.breakdown.model_params_gb:.2f} GB")
    table.add_row("Gradients", f"{result.breakdown.gradients_gb:.2f} GB")
    table.add_row("Optimizer States", f"{result.breakdown.optimizer_states_gb:.2f} GB")
    table.add_row("Activations", f"{result.breakdown.activations_gb:.2f} GB")
    table.add_row("Overhead", f"{result.breakdown.overhead_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Feasibility summary
    status = "✓ Fits" if result.fits_on_gpu else "✗ OOM"
    table.add_row("Status", status)
    table.add_row("Memory Utilization", f"{result.memory_utilization_percent:.1f}%")
    if result.recommended_batch_size:
        table.add_row("Recommended Batch Size", str(result.recommended_batch_size))

    console.print(table)
    return buffer.getvalue()
396
+
397
+
398
if __name__ == "__main__":
    # Allow running this module directly (e.g. `python -m ...cli.main`);
    # invokes the click command group.
    main()
src/gpu_mem_calculator/config/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Configuration parsing and defaults."""
2
+
3
+ from gpu_mem_calculator.config.parser import ConfigParser, load_config, save_config
4
+
5
+ __all__ = ["ConfigParser", "load_config", "save_config"]
src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (375 Bytes). View file
 
src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc ADDED
Binary file (14.2 kB). View file
 
src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc ADDED
Binary file (3.35 kB). View file
 
src/gpu_mem_calculator/config/parser.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration file parser and utilities."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any, cast
6
+
7
+ from pydantic import ValidationError
8
+
9
+ from gpu_mem_calculator.core.models import (
10
+ DType,
11
+ EngineConfig,
12
+ EngineType,
13
+ GPUConfig,
14
+ ModelConfig,
15
+ OffloadDevice,
16
+ OptimizerType,
17
+ ParallelismConfig,
18
+ TrainingConfig,
19
+ )
20
+
21
+
22
+ class ConfigParseError(Exception):
23
+ """Error parsing configuration file."""
24
+
25
+ def __init__(self, message: str, errors: list[Any] | None = None):
26
+ super().__init__(message)
27
+ self.errors = errors or []
28
+
29
+
30
class ConfigParser:
    """Parse and validate configuration files.

    Converts loosely-typed dicts (typically loaded from JSON) into the
    strongly-typed pydantic config models, normalizing string shorthands
    (dtype names, optimizer names, parameter counts like "7B") along the
    way. Parsing never mutates the caller's input dictionaries.
    """

    @staticmethod
    def _convert_dtype(value: str) -> DType:
        """Convert a string dtype name to a DType enum.

        Unrecognized names fall back to BF16.
        """
        dtype_map = {
            "float32": DType.FP32,
            "fp32": DType.FP32,
            "float16": DType.FP16,
            "fp16": DType.FP16,
            "bfloat16": DType.BF16,
            "bf16": DType.BF16,
            "int8": DType.INT8,
            "int4": DType.INT4,
        }
        return dtype_map.get(value.lower(), DType.BF16)

    @staticmethod
    def _convert_optimizer(value: str) -> OptimizerType:
        """Convert a string optimizer name to an OptimizerType enum.

        Unrecognized names fall back to ADAMW.
        """
        opt_map = {
            "adam": OptimizerType.ADAM,
            "adamw": OptimizerType.ADAMW,
            "sgd": OptimizerType.SGD,
            "adamw_8bit": OptimizerType.ADAMW_8BIT,
            "adamw-8bit": OptimizerType.ADAMW_8BIT,
        }
        return opt_map.get(value.lower(), OptimizerType.ADAMW)

    @staticmethod
    def _convert_engine(value: str) -> EngineType:
        """Convert a string engine name to an EngineType enum.

        Unrecognized names fall back to PYTORCH_DDP.
        """
        engine_map = {
            "pytorch": EngineType.PYTORCH_DDP,
            "pytorch_ddp": EngineType.PYTORCH_DDP,
            "ddp": EngineType.PYTORCH_DDP,
            "deepspeed": EngineType.DEEPSPEED,
            "megatron": EngineType.MEGATRON_LM,
            "megatron_lm": EngineType.MEGATRON_LM,
            "megatron-lm": EngineType.MEGATRON_LM,
            "fsdp": EngineType.FSDP,
            "megatron_deepspeed": EngineType.MEGATRON_DEEPSPEED,
        }
        return engine_map.get(value.lower(), EngineType.PYTORCH_DDP)

    @staticmethod
    def _convert_offload(value: str) -> OffloadDevice:
        """Convert a string offload target to an OffloadDevice enum.

        Unrecognized names fall back to NONE.
        """
        offload_map = {
            "none": OffloadDevice.NONE,
            "cpu": OffloadDevice.CPU,
            "nvme": OffloadDevice.NVME,
        }
        return offload_map.get(value.lower(), OffloadDevice.NONE)

    @staticmethod
    def _parse_num_params(value: str | int | float) -> int:
        """Parse number of parameters from various formats.

        Supports:
            - Raw integer: 7000000000
            - Billions: "7B", "7b", "7e9"
            - Millions: "7000M", "7000m", "7000e6"

        Raises:
            ValueError: If the value is a string that cannot be parsed as
                a number, or a type other than str/int/float.
        """
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            return int(value)

        if isinstance(value, str):
            value = value.strip().upper()

            # Billions suffix, e.g. "7B" -> 7_000_000_000
            if value.endswith("B"):
                return int(float(value[:-1]) * 1_000_000_000)

            # Millions suffix, e.g. "7000M" -> 7_000_000_000
            if value.endswith("M"):
                return int(float(value[:-1]) * 1_000_000)

            # Scientific notation, e.g. "7E9"
            if "E" in value:
                return int(float(value))

            # Plain integer string
            return int(value)

        raise ValueError(f"Cannot parse parameter count: {value}")

    @classmethod
    def parse_model_config(cls, data: dict[str, Any]) -> ModelConfig:
        """Parse model configuration from dict.

        Args:
            data: Dictionary with model configuration (not mutated).

        Returns:
            ModelConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        # Shallow copy so string-count normalization below does not
        # mutate the caller's dictionary (previous behavior wrote the
        # converted values back into the input dict).
        data = dict(data)
        try:
            if "num_parameters" in data and isinstance(data["num_parameters"], str):
                data["num_parameters"] = cls._parse_num_params(data["num_parameters"])

            if "largest_layer_params" in data and isinstance(data["largest_layer_params"], str):
                data["largest_layer_params"] = cls._parse_num_params(data["largest_layer_params"])

            return ModelConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid model configuration", e.errors()) from e

    @classmethod
    def parse_training_config(cls, data: dict[str, Any]) -> TrainingConfig:
        """Parse training configuration from dict.

        Args:
            data: Dictionary with training configuration (not mutated).

        Returns:
            TrainingConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        # Shallow copy so enum normalization does not mutate the caller's dict.
        data = dict(data)
        try:
            if "dtype" in data and isinstance(data["dtype"], str):
                data["dtype"] = cls._convert_dtype(data["dtype"])

            if "optimizer" in data and isinstance(data["optimizer"], str):
                data["optimizer"] = cls._convert_optimizer(data["optimizer"])

            return TrainingConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid training configuration", e.errors()) from e

    @classmethod
    def parse_parallelism_config(cls, data: dict[str, Any]) -> ParallelismConfig:
        """Parse parallelism configuration from dict.

        Args:
            data: Dictionary with parallelism configuration

        Returns:
            ParallelismConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            return ParallelismConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid parallelism configuration", e.errors()) from e

    @classmethod
    def parse_engine_config(cls, data: dict[str, Any]) -> EngineConfig:
        """Parse engine configuration from dict.

        Args:
            data: Dictionary with engine configuration (not mutated).

        Returns:
            EngineConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        # Shallow copy so enum normalization does not mutate the caller's dict.
        data = dict(data)
        try:
            if "type" in data and isinstance(data["type"], str):
                data["type"] = cls._convert_engine(data["type"])

            if "offload_optimizer" in data and isinstance(data["offload_optimizer"], str):
                data["offload_optimizer"] = cls._convert_offload(data["offload_optimizer"])

            if "offload_param" in data and isinstance(data["offload_param"], str):
                data["offload_param"] = cls._convert_offload(data["offload_param"])

            return EngineConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid engine configuration", e.errors()) from e

    @classmethod
    def parse_gpu_config(cls, data: dict[str, Any]) -> GPUConfig:
        """Parse GPU configuration from dict.

        Args:
            data: Dictionary with GPU configuration

        Returns:
            GPUConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            return GPUConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid GPU configuration", e.errors()) from e

    @classmethod
    def parse_file(cls, config_path: str | Path) -> dict[str, Any]:
        """Parse configuration from JSON file.

        Args:
            config_path: Path to configuration file

        Returns:
            Dictionary with parsed configuration

        Raises:
            ConfigParseError: If file cannot be read or parsed
        """
        path = Path(config_path)
        if not path.exists():
            raise ConfigParseError(f"Configuration file not found: {config_path}")

        try:
            with path.open("r") as f:
                data = cast(dict[str, Any], json.load(f))
            return data
        except json.JSONDecodeError as e:
            raise ConfigParseError(f"Invalid JSON in configuration file: {e}") from e
        except Exception as e:
            raise ConfigParseError(f"Error reading configuration file: {e}") from e

    @classmethod
    def parse_full_config(
        cls,
        config_path: str | Path,
    ) -> tuple[ModelConfig, TrainingConfig, ParallelismConfig, EngineConfig, GPUConfig]:
        """Parse complete configuration from file.

        Reads the top-level "model", "training", "parallelism", "engine"
        and "hardware" sections; missing sections fall back to each
        model's defaults.

        Args:
            config_path: Path to configuration file

        Returns:
            Tuple of (ModelConfig, TrainingConfig, ParallelismConfig, EngineConfig, GPUConfig)

        Raises:
            ConfigParseError: If validation fails
        """
        data = cls.parse_file(config_path)

        try:
            model_config = cls.parse_model_config(data.get("model", {}))
            training_config = cls.parse_training_config(data.get("training", {}))
            parallelism_config = cls.parse_parallelism_config(data.get("parallelism", {}))
            engine_config = cls.parse_engine_config(data.get("engine", {}))
            gpu_config = cls.parse_gpu_config(data.get("hardware", {}))

            return (
                model_config,
                training_config,
                parallelism_config,
                engine_config,
                gpu_config,
            )
        except ConfigParseError:
            raise
        except Exception as e:
            raise ConfigParseError(f"Unexpected error parsing configuration: {e}") from e
298
+
299
+
300
def load_config(config_path: str | Path) -> dict[str, Any]:
    """Read a JSON configuration file and return its raw contents.

    Thin convenience wrapper around ``ConfigParser.parse_file``.

    Args:
        config_path: Path to configuration file

    Returns:
        Dictionary with configuration data
    """
    parsed: dict[str, Any] = ConfigParser.parse_file(config_path)
    return parsed
310
+
311
+
312
+ def save_config(data: dict[str, Any], output_path: str | Path) -> None:
313
+ """Save configuration to JSON file.
314
+
315
+ Args:
316
+ data: Configuration dictionary to save
317
+ output_path: Path to save configuration file
318
+ """
319
+ path = Path(output_path)
320
+ path.parent.mkdir(parents=True, exist_ok=True)
321
+
322
+ with path.open("w") as f:
323
+ json.dump(data, f, indent=2)
src/gpu_mem_calculator/config/presets.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Preset model configurations loader.
2
+
3
+ This module provides a centralized location for managing model preset
4
+ configurations that can be used by both CLI and web interfaces.
5
+ """
6
+
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Any, cast
10
+
11
+ # Base directory for the package
12
+ BASE_DIR = Path(__file__).parent.parent.parent.parent
13
+
14
+
15
def get_presets_file_path() -> Path:
    """Locate the presets JSON file.

    Prefers ``web/presets/models.json`` under the project root; if that
    file does not exist, returns the ``src`` tree location used by
    development installs (which may itself not exist).

    Returns:
        Path to the presets JSON file
    """
    web_location = BASE_DIR / "web" / "presets" / "models.json"
    if web_location.exists():
        return web_location

    # Fallback for development installs: the packaged presets directory.
    return BASE_DIR / "src" / "gpu_mem_calculator" / "presets" / "models.json"
29
+
30
+
31
def load_presets() -> dict[str, dict[str, Any]]:
    """Load all preset model configurations from disk.

    Returns:
        Dictionary mapping preset names to their configurations
        (each with: display_name, description, config). Returns an
        empty dict when the file is missing or unreadable.
    """
    path = get_presets_file_path()

    if not path.exists():
        return {}

    try:
        # Read and decode in one go; any I/O or JSON failure degrades
        # gracefully to "no presets".
        raw = path.read_text()
        return cast(dict[str, dict[str, Any]], json.loads(raw))
    except (json.JSONDecodeError, OSError):
        return {}
48
+
49
+
50
def get_preset_config(preset_name: str) -> dict[str, Any] | None:
    """Fetch the calculator config for a named preset.

    Args:
        preset_name: Name of the preset to retrieve

    Returns:
        The preset's "config" section (what the calculator consumes),
        or None if the preset does not exist.
    """
    entry = load_presets().get(preset_name)
    if entry is None:
        return None
    # Strip the display metadata; callers only need the config payload.
    return cast(dict[str, Any], entry.get("config", {}))
67
+
68
+
69
def list_presets() -> dict[str, dict[str, str]]:
    """List all available presets with display metadata.

    Returns:
        Dictionary mapping preset names to their display metadata
        (each entry has: display_name, description).
    """
    listing: dict[str, dict[str, str]] = {}
    for name, preset in load_presets().items():
        listing[name] = {
            # Fall back to the raw preset name when no display name is set.
            "display_name": preset.get("display_name", name),
            "description": preset.get("description", ""),
        }
    return listing
src/gpu_mem_calculator/core/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core memory calculation models and formulas."""
2
+
3
+ from gpu_mem_calculator.core.formulas import Precision
4
+ from gpu_mem_calculator.core.models import (
5
+ EngineConfig,
6
+ EngineType,
7
+ GPUConfig,
8
+ ModelConfig,
9
+ ParallelismConfig,
10
+ TrainingConfig,
11
+ )
12
+
13
+ __all__ = [
14
+ "ModelConfig",
15
+ "TrainingConfig",
16
+ "ParallelismConfig",
17
+ "EngineConfig",
18
+ "EngineType",
19
+ "GPUConfig",
20
+ "Precision",
21
+ ]
22
+
23
+ # Import GPUMemoryCalculator separately to avoid circular import
24
+ # Use: from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (562 Bytes). View file
 
src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc ADDED
Binary file (6.51 kB). View file
 
src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc ADDED
Binary file (7.29 kB). View file
 
src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc ADDED
Binary file (24.4 kB). View file
 
src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc ADDED
Binary file (10.8 kB). View file
 
src/gpu_mem_calculator/core/calculator.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main GPU memory calculator.
2
+
3
+ Orchestrates the memory calculation by selecting the appropriate
4
+ training engine and aggregating results.
5
+ """
6
+
7
+ from gpu_mem_calculator.config.parser import ConfigParser
8
+ from gpu_mem_calculator.core.models import (
9
+ EngineConfig,
10
+ EngineType,
11
+ GPUConfig,
12
+ MemoryResult,
13
+ ModelConfig,
14
+ NodeConfig,
15
+ ParallelismConfig,
16
+ TrainingConfig,
17
+ )
18
+ from gpu_mem_calculator.engines import (
19
+ DeepSpeedEngine,
20
+ FSDPEngine,
21
+ MegatronDeepSpeedEngine,
22
+ MegatronLMEngine,
23
+ PyTorchDDPEngine,
24
+ )
25
+
26
+ # Type alias for engine types
27
+ EngineTypeAlias = (
28
+ PyTorchDDPEngine | DeepSpeedEngine | MegatronLMEngine | FSDPEngine | MegatronDeepSpeedEngine
29
+ )
30
+
31
+
32
class GPUMemoryCalculator:
    """Main GPU memory calculator.

    This class provides a high-level interface for calculating
    GPU memory requirements for LLM training: it selects the training
    engine implementation matching the engine configuration and delegates
    the memory math to it.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig | None = None,
        engine_config: EngineConfig | None = None,
        gpu_config: GPUConfig | None = None,
        node_config: NodeConfig | None = None,
    ) -> None:
        """Initialize the calculator.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings (default: no parallelism)
            engine_config: Training engine configuration (default: PyTorch DDP)
            gpu_config: Hardware configuration (default: 1x 80GB GPU)
            node_config: Multi-node configuration (default: single node)
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config or ParallelismConfig()
        self.engine_config = engine_config or EngineConfig()
        self.gpu_config = gpu_config or GPUConfig()
        self.node_config = node_config or NodeConfig()

    def calculate(self) -> MemoryResult:
        """Calculate GPU memory requirements.

        Selects the appropriate training engine based on configuration
        and returns the memory calculation result.

        Returns:
            MemoryResult with complete memory breakdown
        """
        return self._get_engine().calculate_memory()

    def _get_engine(self) -> EngineTypeAlias:
        """Instantiate the engine matching ``engine_config.type``.

        All engines share the same constructor signature, so dispatch is a
        simple type -> class lookup instead of the previous five identical
        ``match`` arms. Unknown engine types fall back to PyTorch DDP,
        preserving the original default behavior.

        Returns:
            Engine instance configured with current settings
        """
        engine_classes = {
            EngineType.PYTORCH_DDP: PyTorchDDPEngine,
            EngineType.DEEPSPEED: DeepSpeedEngine,
            EngineType.MEGATRON_LM: MegatronLMEngine,
            EngineType.FSDP: FSDPEngine,
            EngineType.MEGATRON_DEEPSPEED: MegatronDeepSpeedEngine,
        }
        engine_cls = engine_classes.get(self.engine_config.type, PyTorchDDPEngine)
        return engine_cls(
            model_config=self.model_config,
            training_config=self.training_config,
            parallelism_config=self.parallelism_config,
            engine_config=self.engine_config,
            gpu_config=self.gpu_config,
            node_config=self.node_config,
        )

    @classmethod
    def from_config_file(
        cls,
        config_path: str,
    ) -> "GPUMemoryCalculator":
        """Create calculator from configuration file.

        Args:
            config_path: Path to JSON configuration file

        Returns:
            Configured GPUMemoryCalculator instance
        """
        (
            model_config,
            training_config,
            parallelism_config,
            engine_config,
            gpu_config,
        ) = ConfigParser.parse_full_config(config_path)

        return cls(
            model_config=model_config,
            training_config=training_config,
            parallelism_config=parallelism_config,
            engine_config=engine_config,
            gpu_config=gpu_config,
        )

    def to_dict(self) -> dict:
        """Export calculator configuration to dictionary.

        Returns:
            Dictionary with all configuration sections, keyed the same way
            as the JSON config file ("model", "training", ...).
        """
        return {
            "model": self.model_config.model_dump(),
            "training": self.training_config.model_dump(),
            "parallelism": self.parallelism_config.model_dump(),
            "engine": self.engine_config.model_dump(),
            "hardware": self.gpu_config.model_dump(),
            "multinode": self.node_config.model_dump(),
        }
src/gpu_mem_calculator/core/formulas.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Memory calculation formulas.
2
+
3
+ This module contains the fundamental formulas for calculating GPU memory
4
+ requirements for LLM training.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+
9
+
10
@dataclass
class Precision:
    """Precision information for a data type.

    This is re-exported from utils.precision for convenience.
    """

    # Dtype name, e.g. "fp16" or "bf16".
    name: str
    # Width of one parameter in bits.
    bits_per_param: int
    # Width of one parameter in bytes; declared as float, presumably so
    # sub-byte formats (e.g. int4) can be represented — confirm against
    # utils.precision.
    bytes_per_param: float
    # True for integer (quantized) formats.
    is_integer: bool = False
21
+
22
+
23
def calculate_parameter_memory(
    num_params: int,
    dtype: str,
    num_gpus: int = 1,
) -> float:
    """Calculate memory in GB for model parameters.

    Args:
        num_params: Number of model parameters
        dtype: Data type (e.g., "fp32", "fp16", "bf16", "int8", "int4")
        num_gpus: Number of GPUs for distribution

    Returns:
        Memory in GB

    Note:
        ``num_gpus`` is currently not applied here: how parameters are
        distributed (data / tensor / pipeline parallel) is handled by the
        individual engine implementations.
    """
    from gpu_mem_calculator.utils.precision import gb_from_params

    return gb_from_params(num_params, dtype)
44
+
45
+
46
def calculate_gradient_memory(
    num_params: int,
    dtype: str,
) -> float:
    """Calculate memory in GB for gradients.

    During training each parameter has a matching gradient, typically
    stored in the same precision as the parameters (the FP32 master
    update is accounted for in the optimizer states).

    Args:
        num_params: Number of model parameters
        dtype: Data type for gradients

    Returns:
        Memory in GB
    """
    from gpu_mem_calculator.utils.precision import gb_from_params

    # One gradient per parameter -> same footprint as the parameters.
    return gb_from_params(num_params, dtype)
66
+
67
+
68
def calculate_optimizer_memory(
    num_params: int,
    optimizer: str,
) -> float:
    """Calculate memory in GB for optimizer states.

    Optimizer states are assumed to be kept in FP32.

    Per-parameter state sizes:
        - adam / adamw: 12 bytes — FP32 parameter copy (4) + momentum (4)
          + variance (4)
        - adamw_8bit: ~2 bytes — quantized states (bitsandbytes 8-bit Adam)
        - sgd: 4 bytes — momentum buffer (momentum assumed enabled)
        - anything else: treated like Adam (12 bytes)

    References:
        https://blog.eleuther.ai/transformer-math/#optimizer-states
        https://deepspeed.readthedocs.io/en/latest/memory.html

    Args:
        num_params: Number of model parameters
        optimizer: Optimizer type (adam, adamw, sgd, adamw_8bit)

    Returns:
        Memory in GB (for FP32 optimizer states)
    """
    from gpu_mem_calculator.utils.precision import gb_from_bytes

    state_bytes_per_param = {
        "adam": 12.0,
        "adamw": 12.0,
        "adamw_8bit": 2.0,
        "sgd": 4.0,
    }
    # Unknown optimizer names fall back to Adam-style accounting.
    per_param = state_bytes_per_param.get(optimizer.lower(), 12.0)
    return gb_from_bytes(num_params * per_param)
110
+
111
+
112
def calculate_activation_memory(
    batch_size: int,
    seq_len: int,
    hidden_size: int,
    num_layers: int,
    num_attention_heads: int,
    tensor_parallel_size: int = 1,
    activation_checkpointing: int = 0,
    moe_enabled: bool = False,
    num_experts: int = 1,
    top_k: int = 1,
    expert_intermediate_size: int | None = None,
) -> float:
    """Calculate approximate memory in GB for activations.

    This is an estimate for transformer architectures; actual activation
    memory depends on the specific model implementation and framework.

    The heuristic approximates the EleutherAI Transformer Math 101
    formula for selective activation checkpointing, sbhL(10 + 24/t)
    bytes (s = seq_len, b = batch_size, h = hidden_size, L = num_layers,
    t = tensor_parallel_size), using a flat 16 bytes * hidden_size per
    token per layer.

    For MoE models the estimate is scaled down because only ``top_k`` of
    ``num_experts`` are active per token.

    References:
        https://blog.eleuther.ai/transformer-math/#activations
        https://arxiv.org/abs/2204.13323

    Args:
        batch_size: Batch size per GPU
        seq_len: Sequence length
        hidden_size: Hidden dimension size
        num_layers: Number of transformer layers
        num_attention_heads: Number of attention heads (not used by the
            current heuristic)
        tensor_parallel_size: Tensor parallelism degree
        activation_checkpointing: Checkpointing level (0-4)
        moe_enabled: Whether model uses Mixture of Experts
        num_experts: Total number of experts (for MoE)
        top_k: Number of active experts per token (for MoE)
        expert_intermediate_size: Expert intermediate layer size (for MoE)

    Returns:
        Memory in GB
    """
    from gpu_mem_calculator.utils.precision import gb_from_bytes

    # Heuristic: ~16 bytes of activations per token per layer, scaled by
    # hidden size (middle ground of the 10-24 bytes/token/layer range).
    per_token_layer_bytes = hidden_size * 16

    # MoE scaling: only top_k experts run per token; 0.1 covers router /
    # gating activations. Capped at 1.0 (dense equivalent).
    moe_factor = 1.0
    if moe_enabled and num_experts > 1:
        moe_factor = min(1.0, top_k / num_experts + 0.1)

    # Experts often use wider intermediate layers than dense MLPs (which
    # this heuristic assumes are 2x hidden); cap the increase at 2x.
    if moe_enabled and expert_intermediate_size:
        moe_factor *= min(2.0, expert_intermediate_size / (hidden_size * 2))

    total_bytes = (
        batch_size
        * seq_len
        * num_layers
        * per_token_layer_bytes
        * moe_factor
        / tensor_parallel_size
    )

    # Checkpointing levels 0-4 retain progressively less activation memory:
    # none (100%), attention output (80%), attention input (60%),
    # aggressive (40%), full (20%).
    retained_fraction = [1.0, 0.8, 0.6, 0.4, 0.2][min(activation_checkpointing, 4)]
    total_bytes *= retained_fraction

    return gb_from_bytes(total_bytes)
222
+
223
+
224
def calculate_overhead(
    total_memory: float,
    overhead_factor: float = 0.2,
) -> float:
    """Calculate additional memory overhead.

    Accounts for CUDA context, fragmentation, temporary buffers, etc.,
    as a flat fraction of the already-calculated total.

    Args:
        total_memory: Total calculated memory in GB
        overhead_factor: Fraction to add for overhead (default 20%)

    Returns:
        Overhead memory in GB
    """
    overhead = overhead_factor * total_memory
    return overhead
240
+
241
+
242
+ def estimate_largest_layer_params(
243
+ hidden_size: int,
244
+ num_attention_heads: int,
245
+ intermediate_size: int | None = None,
246
+ ) -> int:
247
+ """Estimate the largest layer parameters for ZeRO-3 calculations.
248
+
249
+ The largest layer is typically the MLP layer or attention projection.
250
+
251
+ Args:
252
+ hidden_size: Hidden dimension size
253
+ num_attention_heads: Number of attention heads
254
+ intermediate_size: MLP intermediate size (default 4 * hidden_size)
255
+
256
+ Returns:
257
+ Estimated number of parameters in the largest layer
258
+ """
259
+ if intermediate_size is None:
260
+ intermediate_size = 4 * hidden_size
261
+
262
+ # MLP layer: hidden_size * intermediate_size * 2 (for up and down projections)
263
+ mlp_params = hidden_size * intermediate_size * 2
264
+
265
+ # Attention output projection: hidden_size * hidden_size
266
+ attn_params = hidden_size * hidden_size
267
+
268
+ return max(mlp_params, attn_params)
src/gpu_mem_calculator/core/models.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data models for GPU memory calculation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+ from typing import Literal, cast
7
+
8
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
9
+ from pydantic_core.core_schema import ValidationInfo as FieldValidationInfo
10
+
11
+
12
class EngineType(str, Enum):
    """Identifiers for the supported distributed-training backends.

    Mixes in ``str`` so members compare equal to their plain string values
    and serialize naturally in configs and JSON.
    """

    PYTORCH_DDP = "pytorch_ddp"  # plain data-parallel training
    DEEPSPEED = "deepspeed"  # DeepSpeed (ZeRO stages, offload)
    MEGATRON_LM = "megatron_lm"  # Megatron-LM tensor/pipeline parallelism
    FSDP = "fsdp"  # PyTorch fully sharded data parallel
    MEGATRON_DEEPSPEED = "megatron_deepspeed"  # Megatron + DeepSpeed hybrid
20
+
21
+
22
class InferenceEngineType(str, Enum):
    """Identifiers for the supported inference serving backends.

    Mixes in ``str`` so members compare equal to their plain string values.
    Both ``TENSORRT_LLM`` and the short alias ``TRTLLM`` are distinct
    members with distinct values.
    """

    HUGGINGFACE = "huggingface"  # transformers `generate` style serving
    VLLM = "vllm"
    TGI = "tgi"  # text-generation-inference
    TENSORRT_LLM = "tensorrt_llm"
    TRTLLM = "trtllm"  # short-form spelling accepted in configs
    SGLANG = "sglang"
31
+
32
+
33
class OptimizerType(str, Enum):
    """Identifiers for the supported optimizers.

    The optimizer choice drives how many bytes of optimizer state are
    modeled per parameter (e.g. Adam-family keeps two moment buffers).
    """

    ADAM = "adam"
    ADAMW = "adamw"
    SGD = "sgd"
    ADAMW_8BIT = "adamw_8bit"  # bitsandbytes-style 8-bit optimizer states
40
+
41
+
42
class DType(str, Enum):
    """Identifiers for the supported numeric precisions.

    Element widths: fp32 = 4 bytes, fp16/bf16 = 2 bytes, int8 = 1 byte,
    int4 = half a byte.
    """

    FP32 = "fp32"
    FP16 = "fp16"
    BF16 = "bf16"
    INT8 = "int8"
    INT4 = "int4"
50
+
51
+
52
class OffloadDevice(str, Enum):
    """Targets for offloading optimizer states or parameters off the GPU."""

    NONE = "none"  # keep everything in GPU memory
    CPU = "cpu"  # offload to host RAM
    NVME = "nvme"  # offload to NVMe storage
58
+
59
+
60
class ModelConfig(BaseModel):
    """Model architecture configuration.

    Describes the transformer architecture (dense or MoE) whose memory
    footprint is being estimated. ``largest_layer_params`` is filled in by
    a validator when not supplied explicitly.
    """

    name: str = Field(default="custom", description="Model name")
    num_parameters: int = Field(gt=0, description="Total number of parameters")
    num_layers: int = Field(gt=0, description="Number of transformer layers")
    hidden_size: int = Field(gt=0, description="Hidden dimension size")
    num_attention_heads: int = Field(gt=0, description="Number of attention heads")
    vocab_size: int = Field(default=32000, gt=0, description="Vocabulary size")
    max_seq_len: int = Field(default=2048, gt=0, description="Maximum sequence length")
    largest_layer_params: int | None = Field(
        default=None,
        gt=0,
        description="Largest layer parameters (auto-calculated if not provided)",
    )

    # MoE (Mixture of Experts) parameters
    moe_enabled: bool = Field(default=False, description="Enable Mixture of Experts")
    num_experts: int = Field(default=8, ge=1, description="Number of experts in MoE")
    top_k: int = Field(default=2, ge=1, description="Number of experts activated per token (top-k)")
    expert_intermediate_size: int | None = Field(
        default=None,
        gt=0,
        description="Expert intermediate layer size (defaults to 4x hidden_size)",
    )
    shared_expert_intermediate_size: int | None = Field(
        default=None,
        gt=0,
        description="Shared expert intermediate size (for models like GLM with shared experts)",
    )

    @model_validator(mode="after")
    def calculate_largest_layer(self) -> ModelConfig:
        """Calculate largest layer params if not provided.

        Runs after field validation, so it also fires when the field is
        omitted (unlike a plain field validator on a defaulted field).
        """
        if self.largest_layer_params is not None:
            return self
        # Calculate it
        hidden = self.hidden_size
        moe_enabled = self.moe_enabled

        if hidden and moe_enabled:
            # For MoE: largest layer includes expert parameters
            # (one expert's up + down projection weights).
            expert_intermediate = self.expert_intermediate_size or hidden * 4
            self.largest_layer_params = int(hidden * expert_intermediate * 2)
        elif hidden:
            # Dense model: attention output + MLP
            # NOTE(review): 4 * hidden^2 disagrees with
            # estimate_largest_layer_params() in formulas.py, which yields
            # max(2 * hidden * 4 * hidden, hidden^2) = 8 * hidden^2 for the
            # same defaults — confirm which estimate is intended.
            self.largest_layer_params = int(hidden * hidden * 4)
        return self

    @property
    def effective_num_experts(self) -> int:
        """Get effective number of experts (returns 1 if MoE disabled)."""
        return self.num_experts if self.moe_enabled else 1

    @property
    def active_experts(self) -> int:
        """Get number of active experts per token (top_k or 1 if dense)."""
        return self.top_k if self.moe_enabled else 1
118
+
119
+
120
class TrainingConfig(BaseModel):
    """Training hyperparameters configuration.

    Captures per-GPU batching, optimizer choice, compute dtype and the
    activation-checkpointing level used by the memory formulas.
    """

    batch_size: int = Field(default=1, gt=0, description="Batch size per GPU")
    gradient_accumulation_steps: int = Field(
        default=1,
        gt=0,
        description="Gradient accumulation steps",
    )
    optimizer: OptimizerType = Field(default=OptimizerType.ADAMW, description="Optimizer type")
    dtype: DType = Field(default=DType.BF16, description="Data type for training")
    # 0 = no checkpointing, 4 = full checkpointing; higher levels trade
    # recomputation for lower activation memory (see formulas.py factors).
    activation_checkpointing: int = Field(
        default=0,
        ge=0,
        le=4,
        description="Activation checkpointing level (0-4)",
    )

    @property
    def effective_batch_size(self) -> int:
        """Samples per optimizer step: micro-batch size x accumulation steps."""
        return self.batch_size * self.gradient_accumulation_steps
142
+
143
+
144
class ParallelismConfig(BaseModel):
    """Parallelism configuration.

    Degrees of tensor, pipeline and data parallelism; their product is the
    total number of GPUs implied by the strategy.
    """

    tensor_parallel_size: int = Field(default=1, ge=1, description="Tensor parallelism degree")
    pipeline_parallel_size: int = Field(default=1, ge=1, description="Pipeline parallelism degree")
    data_parallel_size: int = Field(default=1, ge=1, description="Data parallelism degree")
    sequence_parallel: bool = Field(default=False, description="Enable sequence parallelism")

    @property
    def total_parallel_size(self) -> int:
        """Total GPUs implied: TP degree x PP degree x DP degree."""
        return self.tensor_parallel_size * self.pipeline_parallel_size * self.data_parallel_size
156
+
157
+
158
class EngineConfig(BaseModel):
    """Training engine specific configuration.

    One shared schema for all engines; per the field descriptions,
    ``zero_stage``/``zero_init`` apply to DeepSpeed and
    ``sharding_strategy`` applies to FSDP — fields for other engines are
    presumably ignored by the selected engine (verify in engines/).
    """

    type: EngineType = Field(default=EngineType.PYTORCH_DDP, description="Training engine type")
    zero_stage: int | None = Field(
        default=None,
        ge=0,
        le=3,
        description="DeepSpeed ZeRO stage (only for DeepSpeed engine)",
    )
    offload_optimizer: OffloadDevice = Field(
        default=OffloadDevice.NONE,
        description="CPU offload for optimizer states",
    )
    offload_param: OffloadDevice = Field(
        default=OffloadDevice.NONE,
        description="CPU offload for parameters",
    )
    zero_init: bool = Field(
        default=True,
        description="Use ZeRO initialization (only for DeepSpeed ZeRO-3)",
    )
    sharding_strategy: Literal["no_shard", "shard_grad_op", "full_shard"] = Field(
        default="full_shard",
        description="FSDP sharding strategy",
    )
184
+
185
+
186
class GPUConfig(BaseModel):
    """Hardware configuration.

    ``total_gpu_memory_gb`` is derived from ``num_gpus * gpu_memory_gb``
    when not supplied explicitly.
    """

    num_gpus: int = Field(default=1, ge=1, description="Number of GPUs")
    gpu_memory_gb: float = Field(default=80.0, gt=0, description="GPU memory in GB")
    total_gpu_memory_gb: float | None = Field(
        default=None,
        description="Total GPU memory (calculated if not provided)",
    )

    @model_validator(mode="after")
    def calculate_total_memory(self) -> "GPUConfig":
        """Fill in total GPU memory when it was not provided.

        Implemented as a model validator because pydantic v2 does not run
        field validators on defaulted fields (without
        ``validate_default=True``) — the previous ``@field_validator``
        never fired when the field was omitted, leaving the total ``None``.
        A model validator in ``mode="after"`` always runs and can read the
        already-validated sibling fields directly.
        """
        if self.total_gpu_memory_gb is None:
            self.total_gpu_memory_gb = self.num_gpus * self.gpu_memory_gb
        return self
205
+
206
+
207
class InterconnectType(str, Enum):
    """Identifiers for the inter-node network fabric options.

    Each type maps to a default bandwidth in
    ``NodeConfig.get_interconnect_bandwidth_gbps``.
    """

    INFINIBAND = "infiniband"
    NVLINK = "nvlink"
    ETHERNET_10G = "ethernet_10g"
    ETHERNET_25G = "ethernet_25g"
    ETHERNET_100G = "ethernet_100g"
    ETHERNET_200G = "ethernet_200g"
216
+
217
+
218
class NodeConfig(BaseModel):
    """Multi-node configuration.

    Describes how many nodes the job spans and the interconnect between
    them; bandwidth defaults are derived from ``interconnect_type``.
    """

    num_nodes: int = Field(default=1, ge=1, description="Number of nodes")
    gpus_per_node: int | None = Field(
        default=None,
        ge=1,
        description="GPUs per node (calculated from num_gpus if not provided)",
    )
    interconnect_type: InterconnectType = Field(
        default=InterconnectType.INFINIBAND,
        description="Interconnect type between nodes",
    )
    interconnect_bandwidth_gbps: float | None = Field(
        default=None,
        gt=0,
        description="Interconnect bandwidth in Gbps (default: auto from type)",
    )

    @field_validator("gpus_per_node")
    @classmethod
    def calculate_gpus_per_node(cls, v: int | None, info: FieldValidationInfo) -> int | None:
        """Calculate GPUs per node if not provided.

        NOTE(review): pydantic v2 does not run field validators on
        defaulted fields, so when ``gpus_per_node`` is omitted this
        validator never fires and the field stays ``None`` (callers such
        as MultiNodeCalculator compensate with ``gpus_per_node or 1``).
        NOTE(review): ``num_gpus`` is not a field of this model, so
        ``info.data.get("num_gpus", 1)`` always returns the fallback 1 —
        presumably this was meant to read from GPUConfig; confirm intent.
        """
        if v is None:
            num_nodes = cast(int, info.data.get("num_nodes", 1))
            num_gpus = cast(int, info.data.get("num_gpus", 1))
            return max(1, num_gpus // num_nodes)
        return v

    def get_interconnect_bandwidth_gbps(self) -> float:
        """Get interconnect bandwidth in Gbps.

        Returns bandwidth from config or default based on interconnect type.
        """
        if self.interconnect_bandwidth_gbps:
            return self.interconnect_bandwidth_gbps

        # Default bandwidth values for each interconnect type.
        # NOTE(review): NVLink bandwidth is usually quoted in GB/s rather
        # than Gbps — confirm the 300.0 figure uses the intended unit.
        bandwidth_defaults = {
            InterconnectType.INFINIBAND: 200.0,  # HDR200 InfiniBand
            InterconnectType.NVLINK: 300.0,  # NVLink/NVSwitch
            InterconnectType.ETHERNET_10G: 10.0,
            InterconnectType.ETHERNET_25G: 25.0,
            InterconnectType.ETHERNET_100G: 100.0,
            InterconnectType.ETHERNET_200G: 200.0,
        }
        # Fallback 100.0 is unreachable while the dict covers every enum
        # member, but keeps .get() total if a new member is added.
        return bandwidth_defaults.get(self.interconnect_type, 100.0)

    @property
    def is_multi_node(self) -> bool:
        """Check if this is a multi-node configuration."""
        return self.num_nodes > 1
270
+
271
+
272
class NetworkOverhead(BaseModel):
    """Network communication overhead for multi-node training.

    Per-collective communication volumes plus an optional per-step time
    estimate; a default-constructed instance (all zeros) represents the
    single-node case.
    """

    allreduce_gb: float = Field(default=0.0, ge=0, description="AllReduce communication in GB")
    allgather_gb: float = Field(default=0.0, ge=0, description="AllGather communication in GB")
    reducescatter_gb: float = Field(
        default=0.0, ge=0, description="ReduceScatter communication in GB"
    )
    point_to_point_gb: float = Field(
        default=0.0, ge=0, description="Point-to-point communication in GB"
    )
    total_overhead_gb: float = Field(default=0.0, ge=0, description="Total network overhead in GB")
    estimated_overhead_ms_per_step: float | None = Field(
        default=None,
        description="Estimated communication overhead per training step in milliseconds",
    )
288
+
289
+
290
class HybridParallelismConfig(BaseModel):
    """Hybrid parallelism configuration for optimal multi-node scaling.

    Preferences consumed by ``MultiNodeCalculator.optimize_hybrid_parallelism``;
    nothing here takes effect unless ``auto_optimize`` is True.
    """

    auto_optimize: bool = Field(
        default=False,
        description="Automatically optimize parallelism strategy for given hardware",
    )
    target_gpu_utilization: float = Field(
        default=0.85,
        gt=0.0,
        le=1.0,
        description="Target GPU memory utilization (0.0-1.0)",
    )
    prefer_pipeline_parallel: bool = Field(
        default=False,
        description="Prefer pipeline parallelism over data parallel for multi-node",
    )
    max_pipeline_chunks: int | None = Field(
        default=None,
        ge=1,
        description="Maximum number of pipeline chunks (virtual stages)",
    )
    enable_sequence_parallel: bool = Field(
        default=True,
        description="Enable sequence parallelism for long sequences",
    )
    # Sequences at or above this length force sequence parallelism on
    # during auto-optimization regardless of enable_sequence_parallel.
    sequence_parallel_threshold: int = Field(
        default=4096,
        ge=1,
        description="Sequence length threshold for enabling sequence parallel",
    )
321
+
322
+
323
class MemoryBreakdown(BaseModel):
    """Memory calculation result breakdown."""

    # Allow field names starting with "model_" (e.g. model_params_gb)
    # without tripping pydantic's protected-namespace warning.
    model_config = ConfigDict(protected_namespaces=())

    model_params_gb: float = Field(ge=0, description="Model parameters memory in GB")
    gradients_gb: float = Field(ge=0, description="Gradients memory in GB")
    optimizer_states_gb: float = Field(ge=0, description="Optimizer states memory in GB")
    activations_gb: float = Field(ge=0, description="Activations memory in GB")
    overhead_gb: float = Field(default=0.0, ge=0, description="Additional overhead in GB")

    @property
    def total_memory_gb(self) -> float:
        """Total memory in GB (sum of all components)."""
        return (
            self.model_params_gb
            + self.gradients_gb
            + self.optimizer_states_gb
            + self.activations_gb
            + self.overhead_gb
        )
344
+
345
+
346
class MemoryResult(BaseModel):
    """Complete memory calculation result for a training configuration."""

    total_memory_per_gpu_gb: float = Field(ge=0, description="Total memory per GPU in GB")
    total_memory_all_gpus_gb: float = Field(ge=0, description="Total memory across all GPUs in GB")
    # Non-zero when optimizer/parameter offload moves state to host RAM.
    cpu_memory_gb: float = Field(default=0.0, ge=0, description="CPU memory required in GB")
    breakdown: MemoryBreakdown = Field(description="Memory breakdown by component")
    network_overhead: NetworkOverhead | None = Field(
        default=None,
        description="Network communication overhead for multi-node training",
    )
    fits_on_gpu: bool = Field(description="Whether the config fits on available GPU")
    memory_utilization_percent: float = Field(ge=0, description="Memory utilization percentage")
    recommended_batch_size: int | None = Field(
        default=None,
        description="Recommended batch size if current doesn't fit",
    )
    # Free-form extras; schema is producer-defined (untyped dict).
    multi_node_info: dict | None = Field(
        default=None,
        description="Additional multi-node configuration info",
    )
367
+
368
+
369
class KVCacheQuantization(str, Enum):
    """Quantization formats available for the inference KV cache."""

    NONE = "none"  # keep KV cache at the model's compute dtype
    INT8 = "int8"
    FP8 = "fp8"
    INT4 = "int4"
376
+
377
+
378
class InferenceMemoryBreakdown(BaseModel):
    """Memory breakdown for inference workloads."""

    # Allow field names starting with "model_" (e.g. model_params_gb)
    # without tripping pydantic's protected-namespace warning.
    model_config = ConfigDict(protected_namespaces=())

    model_params_gb: float = Field(ge=0, description="Model parameters memory in GB")
    kv_cache_gb: float = Field(ge=0, description="KV cache memory in GB")
    activations_gb: float = Field(ge=0, description="Activation memory in GB")
    overhead_gb: float = Field(default=0.0, ge=0, description="Additional overhead in GB")

    @property
    def total_memory_gb(self) -> float:
        """Total memory in GB (sum of all components)."""
        return self.model_params_gb + self.kv_cache_gb + self.activations_gb + self.overhead_gb
392
+
393
+
394
class InferenceConfig(BaseModel):
    """Inference-specific configuration.

    One flat schema covering all supported serving engines. The common
    fields apply everywhere; the prefixed/sectioned groups below (TGI,
    vLLM, TensorRT-LLM, SGLang) carry engine-specific knobs — presumably
    only the group matching the selected engine is consulted (verify in
    the engine implementations).
    """

    batch_size: int = Field(default=1, gt=0, description="Batch size for inference")
    max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="Override max sequence length for inference (default: use model config)",
    )
    kv_cache_quantization: KVCacheQuantization = Field(
        default=KVCacheQuantization.NONE,
        description="KV cache quantization type",
    )
    use_kv_cache: bool = Field(default=True, description="Enable KV cache for generation")
    tensor_parallel_size: int = Field(default=1, ge=1, description="Tensor parallelism degree")
    enable_streaming: bool = Field(default=False, description="Enable streaming inference")

    # Common inference options
    gpu_memory_utilization: float = Field(
        default=0.9,
        gt=0.0,
        le=1.0,
        description="GPU memory utilization target (0.0-1.0)",
    )

    # TGI-specific options
    max_total_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum total tokens (input + output) - defines memory budget",
    )
    max_input_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum input tokens",
    )
    max_batch_total_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum total tokens across all batches",
    )
    tgi_quantize: Literal[
        "none",
        "awq",
        "eetq",
        "exl2",
        "gptq",
        "marlin",
        "bitsandbytes",
        "bitsandbytes-nf4",
        "bitsandbytes-fp4",
        "fp8",
    ] = Field(
        default="none",
        description="TGI: Weight quantization method",
    )
    tgi_dtype: Literal["float16", "bfloat16"] = Field(
        default="bfloat16",
        description="TGI: Data type for inference",
    )
    sharded: bool = Field(default=False, description="TGI: Enable sharded inference")
    num_shard: int | None = Field(
        default=None,
        ge=1,
        description="TGI: Number of shards for sharded inference",
    )

    # vLLM-specific options
    block_size: int | None = Field(
        default=None,
        ge=1,
        description="vLLM: Block size for KV cache management (default: 16)",
    )
    swap_space_gb: float = Field(default=0.0, ge=0.0, description="vLLM: CPU swap space in GB")
    enable_prefix_caching: bool = Field(default=False, description="vLLM: Enable prefix caching")
    enforce_eager: bool = Field(
        default=False,
        description="vLLM: Enable eager mode (disable CUDA graph)",
    )
    max_num_batched_tokens: int | None = Field(
        default=None,
        gt=0,
        description="vLLM: Maximum number of batched tokens",
    )
    max_num_seqs: int | None = Field(
        default=None,
        gt=0,
        description="vLLM: Maximum number of sequences in a batch",
    )
    vllm_quantization: Literal["none", "awq", "gptq", "squeezellm", "fp8"] = Field(
        default="none",
        description="vLLM: Weight quantization method",
    )

    # TensorRT-LLM-specific options
    trt_max_batch_size: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum batch size",
    )
    trt_max_input_len: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum input length",
    )
    trt_max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum sequence length",
    )
    trt_max_beam_width: int | None = Field(
        default=None,
        ge=1,
        description="TensorRT-LLM: Maximum beam width for beam search",
    )

    # SGLang-specific options
    chunk_size: int | None = Field(
        default=None,
        ge=1,
        description="SGLang: Prefill chunk size for long contexts (default: 8192)",
    )
    max_running_requests: int | None = Field(
        default=None,
        ge=1,
        description="SGLang: Maximum number of concurrent requests",
    )
    disable_radix_cache: bool = Field(
        default=False,
        description="SGLang: Disable RadixAttention cache (for debugging)",
    )
    enable_p2p: bool = Field(
        default=False,
        description="SGLang: Enable P2P attention for multi-GPU",
    )
    disable_custom_all_reduce: bool = Field(
        default=False,
        description="SGLang: Disable custom all-reduce kernel",
    )
    attention_backend: Literal["flashinfer", "triton", "torch"] = Field(
        default="flashinfer",
        description="SGLang: Attention backend implementation",
    )
    enable_torch_compile: bool = Field(
        default=False,
        description="SGLang: Enable torch.compile for model optimization",
    )
    radix_cache_max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="SGLang: Maximum sequence length for RadixCache",
    )
    speculative_algo: Literal["default", "medusa", "eagle"] = Field(
        default="default",
        description="SGLang: Speculative decoding algorithm",
    )
    multi_lora_enabled: bool = Field(default=False, description="SGLang: Enable multi-LoRA serving")
551
+
552
+
553
class InferenceMemoryResult(BaseModel):
    """Inference memory calculation result."""

    total_memory_per_gpu_gb: float = Field(ge=0, description="Total memory per GPU in GB")
    total_memory_all_gpus_gb: float = Field(ge=0, description="Total memory across all GPUs in GB")
    breakdown: InferenceMemoryBreakdown = Field(description="Memory breakdown by component")
    fits_on_gpu: bool = Field(description="Whether the config fits on available GPU")
    memory_utilization_percent: float = Field(ge=0, description="Memory utilization percentage")
    max_supported_batch_size: int | None = Field(
        default=None,
        description="Maximum batch size that fits in GPU memory",
    )
    estimated_throughput_tokens_per_sec: float | None = Field(
        default=None,
        description="Estimated throughput in tokens/second",
    )
src/gpu_mem_calculator/core/multinode.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Multi-node training calculator.
2
+
3
+ Handles network communication overhead calculation and hybrid
4
+ parallelism optimization for multi-node training configurations.
5
+ """
6
+
7
+ from gpu_mem_calculator.core.models import (
8
+ EngineConfig,
9
+ EngineType,
10
+ HybridParallelismConfig,
11
+ ModelConfig,
12
+ NetworkOverhead,
13
+ NodeConfig,
14
+ ParallelismConfig,
15
+ TrainingConfig,
16
+ )
17
+
18
+
19
class MultiNodeCalculator:
    """Calculator for multi-node training overhead and optimization.

    This class provides:
    - Network communication overhead estimation
    - Hybrid parallelism strategy optimization
    - Multi-node performance modeling

    The communication volumes produced here are coarse analytical
    estimates, not measurements; see the NOTE(review) comments on the
    individual heuristics.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig,
        node_config: NodeConfig,
        engine_config: EngineConfig,
    ) -> None:
        """Initialize the multi-node calculator.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings
            node_config: Multi-node hardware configuration
            engine_config: Training engine configuration
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config
        self.node_config = node_config
        self.engine_config = engine_config

    def calculate_network_overhead(self) -> NetworkOverhead:
        """Calculate network communication overhead for multi-node training.

        Estimates communication overhead for different collective operations
        based on model size, parallelism strategy, and interconnect bandwidth.

        Returns:
            NetworkOverhead with detailed breakdown (all zeros for a
            single-node configuration).
        """
        if not self.node_config.is_multi_node:
            # Single node: no cross-node traffic to model.
            return NetworkOverhead()

        # Get model size in bytes
        model_params = self.model_config.num_parameters
        dtype_bytes = self._get_dtype_bytes()
        model_size_bytes = int(model_params * dtype_bytes)

        # Calculate communication for each collective operation
        allreduce_gb = self._calculate_allreduce_overhead(model_size_bytes)
        allgather_gb = self._calculate_allgather_overhead(model_size_bytes)
        reducescatter_gb = self._calculate_reducescatter_overhead(model_size_bytes)
        point_to_point_gb = self._calculate_pipeline_overhead(model_size_bytes)

        total_overhead_gb = allreduce_gb + allgather_gb + reducescatter_gb + point_to_point_gb

        # Estimate time overhead per step
        overhead_ms = self._estimate_communication_time_ms(total_overhead_gb)

        return NetworkOverhead(
            allreduce_gb=allreduce_gb,
            allgather_gb=allgather_gb,
            reducescatter_gb=reducescatter_gb,
            point_to_point_gb=point_to_point_gb,
            total_overhead_gb=total_overhead_gb,
            estimated_overhead_ms_per_step=overhead_ms,
        )

    def optimize_hybrid_parallelism(
        self,
        hybrid_config: HybridParallelismConfig,
    ) -> ParallelismConfig:
        """Optimize hybrid parallelism strategy for multi-node training.

        Analyzes the hardware configuration and model characteristics
        to recommend optimal parallelism degrees. Returns the existing
        parallelism config unchanged unless ``auto_optimize`` is set.

        Args:
            hybrid_config: Hybrid parallelism configuration and preferences

        Returns:
            Optimized ParallelismConfig
        """
        if not hybrid_config.auto_optimize:
            return self.parallelism_config

        num_nodes = self.node_config.num_nodes
        # gpus_per_node may be None (see NodeConfig validator note); fall back to 1.
        gpus_per_node = self.node_config.gpus_per_node or 1
        total_gpus = num_nodes * gpus_per_node

        seq_len = self.model_config.max_seq_len

        # Determine optimal parallelism strategy: long sequences force
        # sequence parallel on; otherwise honor the user's preference.
        if seq_len >= hybrid_config.sequence_parallel_threshold:
            # Enable sequence parallel for long sequences
            enable_sp = True
        else:
            enable_sp = hybrid_config.enable_sequence_parallel

        # Calculate parallelism degrees
        if hybrid_config.prefer_pipeline_parallel and num_nodes > 1:
            # Prefer pipeline parallel across nodes
            pp_size = int(min(num_nodes, 8))  # Limit pipeline stages
            tp_size = int(min(gpus_per_node, 8))  # Tensor parallel within node
            # Remaining GPUs become data-parallel replicas.
            dp_size = int(total_gpus // (pp_size * tp_size))
        else:
            # Default: maximize data parallel
            tp_size = 1
            pp_size = 1
            dp_size = int(total_gpus)

        # Ensure all values are at least 1 (dp_size can floor to 0 above
        # when pp_size * tp_size exceeds total_gpus).
        tp_size = max(1, tp_size)
        pp_size = max(1, pp_size)
        dp_size = max(1, dp_size)

        return ParallelismConfig(
            tensor_parallel_size=tp_size,
            pipeline_parallel_size=pp_size,
            data_parallel_size=dp_size,
            sequence_parallel=enable_sp,
        )

    def _calculate_allreduce_overhead(self, model_size_bytes: int) -> float:
        """Calculate AllReduce communication overhead.

        AllReduce is used for gradient averaging in data parallel training.
        Algorithm: Ring AllReduce with O(2 * model_size) communication.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        # Ring AllReduce: each GPU sends/receives 2 * model_size / num_gpus
        # But we need the total across the network

        # For gradient averaging: 2 * model_size (send + receive)
        allreduce_bytes = 2 * model_size_bytes

        # Adjust for collective operation efficiency
        # In multi-node, cross-node traffic is the bottleneck
        if self.node_config.is_multi_node:
            # Only cross-node traffic matters
            # NOTE(review): dividing by num_nodes is a coarse heuristic for
            # per-link cross-node volume — confirm against the intended
            # ring all-reduce cost model (2 * (N-1)/N * size per rank).
            allreduce_bytes = int(allreduce_bytes / self.node_config.num_nodes)

        return allreduce_bytes / (1024**3)

    def _calculate_allgather_overhead(self, model_size_bytes: int) -> float:
        """Calculate AllGather communication overhead.

        AllGather is used in ZeRO-3 and tensor parallel for parameter gathering.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        # AllGather: (num_gpus - 1) * model_size / num_gpus per GPU
        # But for ZeRO-3, we gather all parameters
        is_zero3 = (
            self.engine_config.type == EngineType.DEEPSPEED and self.engine_config.zero_stage == 3
        )

        if is_zero3:
            # ZeRO-3 gathers all parameters during forward pass
            allgather_bytes = model_size_bytes
        else:
            # Standard allgather for tensor parallel
            allgather_bytes = int(model_size_bytes / self.parallelism_config.tensor_parallel_size)

        # Adjust for multi-node (same heuristic as all-reduce above)
        if self.node_config.is_multi_node:
            allgather_bytes = int(allgather_bytes / self.node_config.num_nodes)

        return allgather_bytes / (1024**3)

    def _calculate_reducescatter_overhead(self, model_size_bytes: int) -> float:
        """Calculate ReduceScatter communication overhead.

        ReduceScatter is used in ZeRO-2 and gradient sharding.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        is_zero2 = (
            self.engine_config.type == EngineType.DEEPSPEED and self.engine_config.zero_stage == 2
        )

        if is_zero2:
            # ZeRO-2 scatters gradients
            reducescatter_bytes = model_size_bytes
        else:
            # Standard reducescatter
            reducescatter_bytes = int(model_size_bytes / self.parallelism_config.data_parallel_size)

        # Adjust for multi-node (same heuristic as all-reduce above)
        if self.node_config.is_multi_node:
            reducescatter_bytes = int(reducescatter_bytes / self.node_config.num_nodes)

        return reducescatter_bytes / (1024**3)

    def _calculate_pipeline_overhead(self, model_size_bytes: int) -> float:
        """Calculate pipeline parallel communication overhead.

        Point-to-point communication between pipeline stages.

        Args:
            model_size_bytes: Model size in bytes (unused; activations are
                sized from the model/training configs instead)

        Returns:
            Communication volume in GB (0.0 when pipeline parallel is off)
        """
        if self.parallelism_config.pipeline_parallel_size <= 1:
            return 0.0

        # Pipeline parallel sends activations between stages
        # Approximate as layer activations
        hidden_size = self.model_config.hidden_size
        seq_len = self.model_config.max_seq_len
        batch_size = self.training_config.batch_size
        num_layers = self.model_config.num_layers

        # Activation size per layer (2 bytes/element assumes FP16/BF16 —
        # not adjusted for the configured training dtype).
        activation_bytes = batch_size * seq_len * hidden_size * 2  # FP16/BF16

        # Number of microbatches determines communication frequency
        # For simplicity, assume num_stages communications per step
        pp_size = self.parallelism_config.pipeline_parallel_size
        pipeline_comm_bytes = activation_bytes * (num_layers // pp_size)

        # Adjust for multi-node (same heuristic as all-reduce above)
        if self.node_config.is_multi_node:
            pipeline_comm_bytes = int(pipeline_comm_bytes / self.node_config.num_nodes)

        return pipeline_comm_bytes / (1024**3)

    def _estimate_communication_time_ms(self, total_gb: float) -> float:
        """Estimate communication time per training step in milliseconds.

        Args:
            total_gb: Total communication volume in GB

        Returns:
            Estimated time in milliseconds
        """
        if total_gb == 0:
            return 0.0

        # Convert link speed from Gbps (bits) to GB/s (bytes): divide by 8.
        bandwidth_gbps = self.node_config.get_interconnect_bandwidth_gbps()
        bandwidth_gbps_per_sec = bandwidth_gbps / 8  # Convert to GB/s

        # Basic time = size / bandwidth
        time_seconds = total_gb / bandwidth_gbps_per_sec

        # Add latency overhead for collective operations
        # Typical latency: 10-50 microseconds per hop
        num_nodes = self.node_config.num_nodes
        latency_overhead = num_nodes * 0.00005  # 50 microseconds per node

        # Network efficiency factor (not 100% efficient)
        efficiency = 0.85

        total_time_seconds = (time_seconds / efficiency) + latency_overhead

        return total_time_seconds * 1000  # Convert to ms

    def _get_dtype_bytes(self) -> float:
        """Get bytes per element for the configured training dtype.

        Returns:
            Bytes per element (0.5 for int4); defaults to 2 for unknown
            dtype strings.
        """
        dtype_map = {
            "fp32": 4,
            "fp16": 2,
            "bf16": 2,
            "int8": 1,
            "int4": 0.5,
        }
        return dtype_map.get(self.training_config.dtype.value, 2)

    def _calculate_model_size_gb(self) -> float:
        """Calculate model size in GB at the configured training dtype."""
        dtype_bytes = self._get_dtype_bytes()
        model_size_bytes = self.model_config.num_parameters * dtype_bytes
        return model_size_bytes / (1024**3)
src/gpu_mem_calculator/engines/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Training engine implementations."""
2
+
3
+ from gpu_mem_calculator.engines.base import BaseEngine
4
+ from gpu_mem_calculator.engines.deepspeed import DeepSpeedEngine
5
+ from gpu_mem_calculator.engines.fsdp import FSDPEngine
6
+ from gpu_mem_calculator.engines.megatron import MegatronDeepSpeedEngine, MegatronLMEngine
7
+ from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine
8
+
9
+ __all__ = [
10
+ "BaseEngine",
11
+ "PyTorchDDPEngine",
12
+ "DeepSpeedEngine",
13
+ "MegatronLMEngine",
14
+ "MegatronDeepSpeedEngine",
15
+ "FSDPEngine",
16
+ ]
src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (688 Bytes). View file
 
src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc ADDED
Binary file (8.07 kB). View file
 
src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc ADDED
Binary file (11.2 kB). View file
 
src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc ADDED
Binary file (8.07 kB). View file
 
src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc ADDED
Binary file (8.5 kB). View file
 
src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc ADDED
Binary file (3.73 kB). View file
 
src/gpu_mem_calculator/engines/base.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base class for training engine implementations."""
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ from gpu_mem_calculator.core.models import (
6
+ EngineConfig,
7
+ GPUConfig,
8
+ MemoryBreakdown,
9
+ MemoryResult,
10
+ ModelConfig,
11
+ NodeConfig,
12
+ ParallelismConfig,
13
+ TrainingConfig,
14
+ )
15
+
16
+
17
class BaseEngine(ABC):
    """Abstract base class for training-engine memory calculation.

    Each training engine (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP, ...)
    implements :meth:`calculate_memory`; this base class supplies the shared
    plumbing: configuration storage, feasibility checks, result assembly,
    and MoE helpers.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig,
        engine_config: EngineConfig,
        gpu_config: GPUConfig,
        node_config: NodeConfig | None = None,
    ) -> None:
        """Initialize the engine with configuration.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings
            engine_config: Engine-specific configuration
            gpu_config: Hardware configuration
            node_config: Multi-node configuration; defaults to a
                single-node ``NodeConfig()`` when omitted
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config
        self.engine_config = engine_config
        self.gpu_config = gpu_config
        self.node_config = node_config or NodeConfig()

    @abstractmethod
    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for this engine.

        This is the main method that must be implemented by each engine.

        Returns:
            MemoryResult with complete memory breakdown
        """

    def _check_feasibility(
        self,
        total_memory_per_gpu: float,
    ) -> tuple[bool, float, int | None]:
        """Check whether the configuration fits on the available GPU.

        Args:
            total_memory_per_gpu: Total memory required per GPU, in GB

        Returns:
            Tuple of (fits_on_gpu, utilization_percent, recommended_batch_size);
            ``recommended_batch_size`` is ``None`` when the config already fits.
        """
        available_memory = self.gpu_config.gpu_memory_gb
        utilization_percent = (total_memory_per_gpu / available_memory) * 100
        fits_on_gpu = total_memory_per_gpu <= available_memory

        recommended_batch_size = None
        if not fits_on_gpu:
            # Simple heuristic: scale batch size inversely with how far over
            # budget we are (activations scale ~linearly with batch size).
            excess_factor = total_memory_per_gpu / available_memory
            recommended_batch_size = max(1, int(self.training_config.batch_size / excess_factor))

        return fits_on_gpu, utilization_percent, recommended_batch_size

    def _create_result(
        self,
        breakdown: MemoryBreakdown,
        cpu_memory_gb: float = 0.0,
    ) -> MemoryResult:
        """Assemble a MemoryResult from a per-GPU breakdown.

        Args:
            breakdown: Memory breakdown by component
            cpu_memory_gb: Host (CPU) memory required, in GB (default 0)

        Returns:
            Complete MemoryResult, including network overhead and node info
            for multi-node configurations.
        """
        total_memory_per_gpu = breakdown.total_memory_gb
        total_memory_all_gpus = total_memory_per_gpu * self.gpu_config.num_gpus

        fits_on_gpu, utilization_percent, recommended_batch_size = self._check_feasibility(
            total_memory_per_gpu
        )

        # Network overhead and topology info are only meaningful when the
        # job spans more than one node.
        network_overhead = None
        multi_node_info = None
        if self.node_config.is_multi_node:
            # Local import to avoid a circular dependency with core.multinode.
            from gpu_mem_calculator.core.multinode import MultiNodeCalculator

            multinode_calc = MultiNodeCalculator(
                model_config=self.model_config,
                training_config=self.training_config,
                parallelism_config=self.parallelism_config,
                node_config=self.node_config,
                engine_config=self.engine_config,
            )
            network_overhead = multinode_calc.calculate_network_overhead()

            multi_node_info = {
                "num_nodes": self.node_config.num_nodes,
                "gpus_per_node": self.node_config.gpus_per_node,
                "interconnect_type": self.node_config.interconnect_type.value,
                "interconnect_bandwidth_gbps": self.node_config.get_interconnect_bandwidth_gbps(),
            }

        return MemoryResult(
            total_memory_per_gpu_gb=total_memory_per_gpu,
            total_memory_all_gpus_gb=total_memory_all_gpus,
            cpu_memory_gb=cpu_memory_gb,
            breakdown=breakdown,
            network_overhead=network_overhead,
            fits_on_gpu=fits_on_gpu,
            memory_utilization_percent=utilization_percent,
            recommended_batch_size=recommended_batch_size,
            multi_node_info=multi_node_info,
        )

    @property
    def effective_batch_size(self) -> int:
        """Effective global batch size including gradient accumulation."""
        return (
            self.training_config.batch_size
            * self.training_config.gradient_accumulation_steps
            * self.parallelism_config.data_parallel_size
        )

    @property
    def total_num_gpus(self) -> int:
        """Total number of GPUs in the job."""
        return self.gpu_config.num_gpus

    @property
    def num_gpus_per_model(self) -> int:
        """Number of GPUs holding one model replica.

        This is tensor_parallel * pipeline_parallel for distributed training.
        """
        return (
            self.parallelism_config.tensor_parallel_size
            * self.parallelism_config.pipeline_parallel_size
        )

    def calculate_moe_activation_multiplier(self) -> float:
        """Calculate the activation-memory multiplier for MoE models.

        For MoE models, activation memory depends on ``top_k`` (active
        experts per token) rather than the total number of experts, because
        only ``top_k`` experts run in the forward/backward pass.

        Returns:
            Multiplier for activation memory (1.0 for dense models, <=1 for MoE)
        """
        if not self.model_config.moe_enabled:
            return 1.0

        num_experts = self.model_config.num_experts
        top_k = self.model_config.top_k

        # Robustness fix: a misconfigured MoE model (expert count or top_k
        # unset/zero) previously raised TypeError/ZeroDivisionError here.
        # Fall back to dense behaviour instead.
        if not num_experts or not top_k:
            return 1.0

        # Base activation ratio: only top_k of num_experts are active.
        activation_ratio = top_k / num_experts

        # Router/gating network overhead (typically 5-15% extra).
        router_overhead = 0.1

        # Models with an always-active shared expert (like GLM) contribute
        # one additional expert's worth of activations (approximation).
        if self.model_config.shared_expert_intermediate_size:
            activation_ratio = activation_ratio + (1.0 / num_experts)

        # Never exceed the dense-model activation footprint.
        return min(1.0, activation_ratio + router_overhead)

    def calculate_moe_parameter_ratio(self) -> float:
        """Calculate the effective parameter ratio for MoE models.

        Only top_k experts are used per token, but *all* expert parameters
        must be stored (and receive gradients), so parameter storage for
        training is always the full set. The hook exists for
        inference-specific calculations that may want a lower ratio.

        Returns:
            Ratio of stored parameters to total parameters (always 1.0).
        """
        if not self.model_config.moe_enabled:
            return 1.0

        return 1.0  # All parameters stored in memory
src/gpu_mem_calculator/engines/deepspeed.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DeepSpeed ZeRO engine implementation.
2
+
3
+ Implements memory calculations for DeepSpeed ZeRO stages 1, 2, and 3.
4
+ Based on: https://deepspeed.readthedocs.io/en/latest/memory.html
5
+ """
6
+
7
+ from gpu_mem_calculator.core.formulas import (
8
+ calculate_activation_memory,
9
+ calculate_overhead,
10
+ estimate_largest_layer_params,
11
+ )
12
+ from gpu_mem_calculator.core.models import (
13
+ MemoryBreakdown,
14
+ MemoryResult,
15
+ OffloadDevice,
16
+ )
17
+ from gpu_mem_calculator.engines.base import BaseEngine
18
+ from gpu_mem_calculator.utils.precision import gb_from_bytes
19
+
20
+
21
class DeepSpeedEngine(BaseEngine):
    """DeepSpeed ZeRO memory calculation.

    Implements ZeRO stages:
    - ZeRO-1: shard optimizer states
    - ZeRO-2: shard optimizer states + gradients
    - ZeRO-3: shard optimizer states + gradients + parameters
    """

    def calculate_memory(self) -> MemoryResult:
        """Dispatch to the stage-specific memory calculation.

        Returns:
            MemoryResult with complete memory breakdown
        """
        stage = self.engine_config.zero_stage or 0
        offload_optimizer = self.engine_config.offload_optimizer
        offload_param = self.engine_config.offload_param

        # ZeRO-3 needs the largest-layer parameter count; estimate it from
        # the architecture when the model config does not provide one.
        largest_layer_params = self.model_config.largest_layer_params
        if largest_layer_params is None:
            largest_layer_params = estimate_largest_layer_params(
                hidden_size=self.model_config.hidden_size,
                num_attention_heads=self.model_config.num_attention_heads,
            )

        if stage == 0:
            return self._calculate_zero0()
        if stage == 1:
            return self._calculate_zero1(offload_optimizer)
        if stage == 3:
            return self._calculate_zero3(
                offload_optimizer,
                offload_param,
                largest_layer_params,
            )
        # Stage 2, and the fallback for any unrecognized stage value.
        return self._calculate_zero2(offload_optimizer)

    def _activations_gb(self) -> float:
        """Per-GPU activation memory, identical across all ZeRO stages."""
        return calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

    def _calculate_zero0(self) -> MemoryResult:
        """ZeRO-0 (disabled) is identical to plain PyTorch DDP; delegate."""
        # Import here to avoid a circular dependency.
        from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine

        # NOTE(review): node_config is not forwarded to the DDP engine, so
        # multi-node info is dropped for ZeRO-0 — confirm this is intended.
        delegate = PyTorchDDPEngine(
            model_config=self.model_config,
            training_config=self.training_config,
            parallelism_config=self.parallelism_config,
            engine_config=self.engine_config,
            gpu_config=self.gpu_config,
        )
        return delegate.calculate_memory()

    def _calculate_zero1(
        self,
        offload_optimizer: OffloadDevice,
    ) -> MemoryResult:
        """ZeRO-1: optimizer states sharded across data-parallel GPUs.

        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

        Per-GPU memory:
        - fp16/bf16 parameters: 2 bytes/param (replicated)
        - fp16/bf16 gradients: 2 bytes/param (replicated)
        - Adam/AdamW optimizer states: 12 bytes/param (fp32 master copy +
          momentum + variance, 4 bytes each), sharded across GPUs — or moved
          entirely to host RAM when offloaded.
        """
        n = self.model_config.num_parameters
        world_size = self.total_num_gpus

        # fp16/bf16 weights and gradients, fully replicated on every GPU.
        weights_gb = gb_from_bytes(n * 2)
        grads_gb = gb_from_bytes(n * 2)

        if offload_optimizer == OffloadDevice.CPU:
            # Optimizer lives entirely in host memory.
            optim_gb = 0.0
            cpu_gb = gb_from_bytes(n * 12)
        else:
            # 12 bytes/param sharded over all data-parallel ranks.
            optim_gb = gb_from_bytes((n * 12) / world_size)
            cpu_gb = 0.0

        acts_gb = self._activations_gb()
        overhead_gb = calculate_overhead(weights_gb + grads_gb + optim_gb + acts_gb)

        breakdown = MemoryBreakdown(
            model_params_gb=weights_gb,
            gradients_gb=grads_gb,
            optimizer_states_gb=optim_gb,
            activations_gb=acts_gb,
            overhead_gb=overhead_gb,
        )
        return self._create_result(breakdown, cpu_gb)

    def _calculate_zero2(
        self,
        offload_optimizer: OffloadDevice,
    ) -> MemoryResult:
        """ZeRO-2: optimizer states AND gradients sharded across GPUs.

        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

        Per-GPU memory:
        - fp16/bf16 parameters: 2 bytes/param (replicated; NOT sharded)
        - fp16/bf16 gradients: 2 bytes/param sharded over GPUs
        - Adam/AdamW optimizer states: 12 bytes/param sharded over GPUs,
          or moved entirely to host RAM when offloaded.
        """
        n = self.model_config.num_parameters
        world_size = self.total_num_gpus

        # Weights are replicated; gradients are sharded (the ZeRO-2 change).
        weights_gb = gb_from_bytes(n * 2)
        grads_gb = gb_from_bytes((n * 2) / world_size)

        if offload_optimizer == OffloadDevice.CPU:
            # Optimizer lives entirely in host memory.
            optim_gb = 0.0
            cpu_gb = gb_from_bytes(n * 12)
        else:
            # 12 bytes/param sharded over all data-parallel ranks.
            optim_gb = gb_from_bytes((n * 12) / world_size)
            cpu_gb = 0.0

        acts_gb = self._activations_gb()
        overhead_gb = calculate_overhead(weights_gb + grads_gb + optim_gb + acts_gb)

        breakdown = MemoryBreakdown(
            model_params_gb=weights_gb,
            gradients_gb=grads_gb,
            optimizer_states_gb=optim_gb,
            activations_gb=acts_gb,
            overhead_gb=overhead_gb,
        )
        return self._create_result(breakdown, cpu_gb)

    def _calculate_zero3(
        self,
        offload_optimizer: OffloadDevice,
        offload_param: OffloadDevice,
        largest_layer_params: int,
    ) -> MemoryResult:
        """ZeRO-3: parameters, gradients, and optimizer states all sharded.

        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

        Per-GPU memory:
        - Gathered largest layer: 4 * largest_layer_params
          (fp16 params + fp16 grads materialized during compute)
        - No offload: + 18 * params / num_gpus
          (16 bytes optimizer states + 2 bytes fp16 params, sharded)
        - Optimizer offload only: + 2 * params / num_gpus
        - Param + optimizer offload: gathered layer only (limit is host RAM)

        Optimizer states are 16 bytes/param here (fp32 master copy,
        momentum, variance, plus an fp32 gradient copy for the update).
        """
        n = self.model_config.num_parameters
        world_size = self.total_num_gpus

        # The largest layer is fully materialized (fp16 params + fp16 grads)
        # on a GPU while it is being computed: 4 bytes per layer parameter.
        gathered_layer_gb = gb_from_bytes(largest_layer_params * 4)

        if offload_param == OffloadDevice.CPU and offload_optimizer == OffloadDevice.CPU:
            # Case 2: everything on the host; GPU holds only the gathered layer.
            sharded_params_gb = 0.0
            sharded_grads_gb = 0.0
            optim_gb = 0.0
            cpu_gb = gb_from_bytes(n * 18)  # full model state on CPU
        elif offload_optimizer == OffloadDevice.CPU:
            # Case 3: only the optimizer is offloaded.
            sharded_params_gb = gb_from_bytes((n * 2) / world_size)
            sharded_grads_gb = gb_from_bytes((n * 2) / world_size)
            optim_gb = 0.0
            cpu_gb = gb_from_bytes(n * 16)  # optimizer states on CPU
        else:
            # Case 1: no offload; everything sharded across GPUs.
            sharded_params_gb = gb_from_bytes((n * 2) / world_size)
            sharded_grads_gb = gb_from_bytes((n * 2) / world_size)
            optim_gb = gb_from_bytes((n * 16) / world_size)  # FP32 states
            cpu_gb = 0.0

        acts_gb = self._activations_gb()
        base = (
            gathered_layer_gb
            + sharded_params_gb
            + sharded_grads_gb
            + optim_gb
            + acts_gb
        )
        overhead_gb = calculate_overhead(base)

        # The breakdown folds the gathered layer into the parameter bucket.
        breakdown = MemoryBreakdown(
            model_params_gb=gathered_layer_gb + sharded_params_gb,
            gradients_gb=sharded_grads_gb,
            optimizer_states_gb=optim_gb,
            activations_gb=acts_gb,
            overhead_gb=overhead_gb,
        )
        return self._create_result(breakdown, cpu_gb)
src/gpu_mem_calculator/engines/fsdp.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FSDP (Fully Sharded Data Parallel) engine implementation.
2
+
3
+ Implements memory calculations for PyTorch FSDP.
4
+
5
+ Reference: https://pytorch.org/docs/stable/fsdp.html
6
+ Reference: https://blog.eleuther.ai/transformer-math/
7
+ """
8
+
9
+ from gpu_mem_calculator.core.formulas import (
10
+ calculate_activation_memory,
11
+ calculate_overhead,
12
+ estimate_largest_layer_params,
13
+ )
14
+ from gpu_mem_calculator.core.models import (
15
+ MemoryBreakdown,
16
+ MemoryResult,
17
+ )
18
+ from gpu_mem_calculator.engines.base import BaseEngine
19
+ from gpu_mem_calculator.utils.precision import gb_from_bytes
20
+
21
+
22
class FSDPEngine(BaseEngine):
    """PyTorch FSDP memory calculation.

    FSDP shards model parameters, gradients, and optimizer states across
    data-parallel GPUs, similar to DeepSpeed ZeRO-3.

    Sharding strategies:
    - NO_SHARD: equivalent to DDP (no sharding)
    - SHARD_GRAD_OP: shard gradients and optimizer states (like ZeRO-2)
    - FULL_SHARD: shard everything (like ZeRO-3)
    """

    def calculate_memory(self) -> MemoryResult:
        """Dispatch to the strategy-specific memory calculation.

        Returns:
            MemoryResult with complete memory breakdown
        """
        strategy = self.engine_config.sharding_strategy

        # FULL_SHARD needs the largest-layer parameter count; estimate it
        # from the architecture when the model config does not provide one.
        largest_layer_params = self.model_config.largest_layer_params
        if largest_layer_params is None:
            largest_layer_params = estimate_largest_layer_params(
                hidden_size=self.model_config.hidden_size,
                num_attention_heads=self.model_config.num_attention_heads,
            )

        if strategy == "no_shard":
            return self._calculate_no_shard()
        if strategy == "shard_grad_op":
            return self._calculate_shard_grad_op()
        # FULL_SHARD, and the fallback for any unrecognized strategy.
        return self._calculate_full_shard(largest_layer_params)

    def _activations_gb(self) -> float:
        """Per-GPU activation memory, identical across sharding strategies."""
        return calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

    def _calculate_no_shard(self) -> MemoryResult:
        """NO_SHARD is identical to plain DDP; delegate to that engine."""
        # Import here to avoid a circular dependency.
        from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine

        delegate = PyTorchDDPEngine(
            model_config=self.model_config,
            training_config=self.training_config,
            parallelism_config=self.parallelism_config,
            engine_config=self.engine_config,
            gpu_config=self.gpu_config,
        )
        return delegate.calculate_memory()

    def _calculate_shard_grad_op(self) -> MemoryResult:
        """SHARD_GRAD_OP: gradients and optimizer states sharded.

        Similar to DeepSpeed ZeRO-2.

        Reference: https://pytorch.org/tutorials/intermediate/FSDP_advanced.html
        Reference: https://blog.eleuther.ai/transformer-math/

        Per-GPU memory:
        - fp16/bf16 parameters: full model on each GPU (not sharded)
        - fp16/bf16 gradients: sharded across GPUs
        - Adam/AdamW optimizer states: 12 bytes/param sharded across GPUs
          (fp32 master copy + momentum + variance, 4 bytes each)
        """
        n = self.model_config.num_parameters
        world_size = self.total_num_gpus

        # Full fp16/bf16 weight replica per GPU; gradients are sharded.
        weights_gb = gb_from_bytes(n * 2)
        grads_gb = gb_from_bytes((n * 2) / world_size)

        # 12 bytes/param of fp32 optimizer state, sharded over GPUs.
        optim_gb = gb_from_bytes((n * 12) / world_size)

        acts_gb = self._activations_gb()
        overhead_gb = calculate_overhead(weights_gb + grads_gb + optim_gb + acts_gb)

        breakdown = MemoryBreakdown(
            model_params_gb=weights_gb,
            gradients_gb=grads_gb,
            optimizer_states_gb=optim_gb,
            activations_gb=acts_gb,
            overhead_gb=overhead_gb,
        )
        return self._create_result(breakdown)

    def _calculate_full_shard(self, largest_layer_params: int) -> MemoryResult:
        """FULL_SHARD: parameters, gradients, and optimizer states sharded.

        Similar to DeepSpeed ZeRO-3.

        Reference: https://pytorch.org/tutorials/intermediate/FSDP_advanced.html
        Reference: https://blog.eleuther.ai/transformer-math/

        Per-GPU memory:
        - Gathered largest layer: 4 * largest_layer_params
          (fp16 params + fp16 grads materialized during compute)
        - Sharded params + grads: 2 bytes each / num_gpus
        - Sharded optimizer states: 12 bytes/param / num_gpus

        Total per GPU: largest_layer_memory + 16 * params / num_gpus.

        FSDP uses 12 bytes for optimizer states (not 16 like DeepSpeed
        ZeRO-3) because it keeps no extra fp32 gradient copy there.
        """
        n = self.model_config.num_parameters
        world_size = self.total_num_gpus

        # Largest layer materialized (fp16 params + fp16 grads) on a GPU
        # while it is being computed: 4 bytes per layer parameter.
        gathered_layer_gb = gb_from_bytes(largest_layer_params * 4)

        # Sharded fp16 parameters and gradients.
        sharded_params_gb = gb_from_bytes((n * 2) / world_size)
        sharded_grads_gb = gb_from_bytes((n * 2) / world_size)

        # Sharded fp32 optimizer states: master copy + momentum + variance.
        sharded_optim_gb = gb_from_bytes((n * 12) / world_size)

        # Parameter bucket in the breakdown = gathered layer + sharded shard.
        params_bucket_gb = gathered_layer_gb + sharded_params_gb

        acts_gb = self._activations_gb()
        base = (
            gathered_layer_gb
            + sharded_params_gb
            + sharded_grads_gb
            + sharded_optim_gb
            + acts_gb
        )
        overhead_gb = calculate_overhead(base)

        breakdown = MemoryBreakdown(
            model_params_gb=params_bucket_gb,
            gradients_gb=sharded_grads_gb,
            optimizer_states_gb=sharded_optim_gb,
            activations_gb=acts_gb,
            overhead_gb=overhead_gb,
        )
        return self._create_result(breakdown)
src/gpu_mem_calculator/engines/megatron.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Megatron-LM engine implementation.
2
+
3
+ Implements memory calculations for Megatron-LM with tensor, pipeline,
4
+ and sequence parallelism.
5
+
6
+ Reference: https://github.com/NVIDIA/Megatron-LM
7
+ Reference: https://arxiv.org/abs/1909.08053
8
+ Reference: https://blog.eleuther.ai/transformer-math/
9
+ """
10
+
11
+ from gpu_mem_calculator.core.formulas import (
12
+ calculate_activation_memory,
13
+ calculate_gradient_memory,
14
+ calculate_optimizer_memory,
15
+ calculate_overhead,
16
+ calculate_parameter_memory,
17
+ )
18
+ from gpu_mem_calculator.core.models import (
19
+ MemoryBreakdown,
20
+ MemoryResult,
21
+ )
22
+ from gpu_mem_calculator.engines.base import BaseEngine
23
+ from gpu_mem_calculator.utils.precision import gb_from_bytes
24
+
25
+
26
class MegatronLMEngine(BaseEngine):
    """Megatron-LM memory calculation.

    Megatron-LM uses tensor parallelism to split individual layers across
    GPUs, and optionally pipeline parallelism to split layers across GPUs.
    """

    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for Megatron-LM training.

        Megatron-LM memory characteristics:
        - Parameters are sharded across tensor parallel GPUs
        - Gradients are sharded across tensor parallel GPUs
        - Optimizer states can be sharded or replicated
        - Activations depend on tensor/pipeline/sequence parallelism

        Returns:
            MemoryResult with complete memory breakdown
        """
        parallel_cfg = self.parallelism_config
        tp = parallel_cfg.tensor_parallel_size
        pp = parallel_cfg.pipeline_parallel_size
        sp_enabled = parallel_cfg.sequence_parallel

        # Tensor parallelism shards both weights and gradients: each TP
        # rank holds 1/tp of the total parameter count.
        shard_params = int(self.model_config.num_parameters / tp)
        dtype_name = self.training_config.dtype.value

        # 1. Model parameters (TP-sharded, stored in the training dtype).
        weights_gb = calculate_parameter_memory(
            num_params=shard_params,
            dtype=dtype_name,
        )

        # 2. Gradients (TP-sharded, same precision as parameters).
        grads_gb = calculate_gradient_memory(
            num_params=shard_params,
            dtype=dtype_name,
        )

        # 3. Optimizer states — assumed sharded the same way as the
        # parameters, the typical Megatron-LM configuration (this can vary
        # based on configuration).
        opt_gb = calculate_optimizer_memory(
            num_params=shard_params,
            optimizer=self.training_config.optimizer.value,
        )

        # 4. Activations, adjusted for TP / PP / sequence parallelism.
        acts_gb = self._calculate_megatron_activations(
            tp_size=tp,
            pp_size=pp,
            seq_parallel=sp_enabled,
        )

        # 5. Framework overhead on top of the accounted components.
        subtotal = weights_gb + grads_gb + opt_gb + acts_gb

        breakdown = MemoryBreakdown(
            model_params_gb=weights_gb,
            gradients_gb=grads_gb,
            optimizer_states_gb=opt_gb,
            activations_gb=acts_gb,
            overhead_gb=calculate_overhead(subtotal),
        )

        return self._create_result(breakdown)

    def _calculate_megatron_activations(
        self,
        tp_size: int,
        pp_size: int,
        seq_parallel: bool,
    ) -> float:
        """Calculate activation memory for Megatron-LM.

        Megatron-LM activations are affected by the parallelism strategy:
        - Tensor parallelism: splits the hidden dimension
        - Pipeline parallelism: only the current stage's activations
        - Sequence parallelism: splits the sequence dimension

        Args:
            tp_size: Tensor parallelism size
            pp_size: Pipeline parallelism size
            seq_parallel: Whether sequence parallelism is enabled

        Returns:
            Activation memory in GB
        """
        model_cfg = self.model_config
        train_cfg = self.training_config

        # Base per-replica activation footprint (already TP-aware via
        # tensor_parallel_size).
        per_replica = calculate_activation_memory(
            batch_size=train_cfg.batch_size,
            seq_len=model_cfg.max_seq_len,
            hidden_size=model_cfg.hidden_size,
            num_layers=model_cfg.num_layers,
            num_attention_heads=model_cfg.num_attention_heads,
            tensor_parallel_size=tp_size,
            activation_checkpointing=train_cfg.activation_checkpointing,
            moe_enabled=model_cfg.moe_enabled,
            num_experts=model_cfg.num_experts,
            top_k=model_cfg.top_k,
            expert_intermediate_size=model_cfg.expert_intermediate_size,
        )

        # Each pipeline stage holds only num_layers / pp_size layers.
        scale = 1.0 / pp_size

        # Sequence parallelism additionally splits the sequence dimension
        # across the tensor-parallel group.
        if seq_parallel and tp_size > 1:
            scale /= tp_size

        return per_replica * scale
145
+
146
+
147
class MegatronDeepSpeedEngine(BaseEngine):
    """Megatron-LM + DeepSpeed combined engine.

    This combines Megatron-LM's tensor/pipeline parallelism with
    DeepSpeed ZeRO's optimizer/gradient sharding.
    """

    def calculate_memory(self) -> MemoryResult:
        """Calculate memory for Megatron-LM + DeepSpeed.

        This uses:
        - Megatron-LM for tensor/pipeline parallelism and activation memory
        - DeepSpeed ZeRO for optimizer/gradient sharding

        Returns:
            MemoryResult with complete memory breakdown
        """
        tp_size = self.parallelism_config.tensor_parallel_size
        pp_size = self.parallelism_config.pipeline_parallel_size
        seq_parallel = self.parallelism_config.sequence_parallel
        dp_size = self.parallelism_config.data_parallel_size

        # Activation memory follows the Megatron-LM model (TP/PP/SP aware).
        activations_gb = self._calculate_megatron_activations(
            tp_size=tp_size,
            pp_size=pp_size,
            seq_parallel=seq_parallel,
        )

        # Parameters, gradients, and optimizer states use DeepSpeed ZeRO
        # sharding on top of the tensor-parallel split.
        params_per_gpu = self.model_config.num_parameters / tp_size

        zero_stage = self.engine_config.zero_stage or 2
        offload_optimizer = self.engine_config.offload_optimizer

        # Model parameters: 2 bytes/param (fp16/bf16). ZeRO-3 additionally
        # shards them across the data-parallel group.
        if zero_stage >= 3:
            model_params_gb = gb_from_bytes((params_per_gpu * 2) / dp_size)
        else:
            # ZeRO-0/1/2 keeps a full copy of the TP shard on each GPU.
            model_params_gb = gb_from_bytes(params_per_gpu * 2)

        # Gradients: sharded across DP ranks from ZeRO-2 upwards.
        if zero_stage >= 2:
            gradients_gb = gb_from_bytes((params_per_gpu * 2) / dp_size)
        else:
            gradients_gb = gb_from_bytes(params_per_gpu * 2)

        # Optimizer states: 12 bytes/param for Adam/AdamW in FP32 (fp32
        # master copy + momentum + variance). ZeRO-1+ shards them across
        # DP ranks; CPU offload removes them from GPU memory entirely.
        # NOTE(review): only the "cpu" offload target is treated as
        # off-GPU here -- confirm whether other targets (e.g. nvme)
        # should also zero this term.
        if offload_optimizer.value == "cpu":
            optimizer_gb = 0.0
        elif zero_stage >= 1:
            optimizer_gb = gb_from_bytes((params_per_gpu * 12) / dp_size)
        else:
            optimizer_gb = gb_from_bytes(params_per_gpu * 12)

        # Overhead: base_memory is already expressed in GB, so apply the
        # shared fractional-overhead formula. (The previous
        # gb_from_bytes(base_memory * 0.2) treated a GB quantity as bytes,
        # collapsing the overhead term to effectively zero, and was
        # inconsistent with MegatronLMEngine.)
        base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
        overhead_gb = calculate_overhead(base_memory)

        breakdown = MemoryBreakdown(
            model_params_gb=model_params_gb,
            gradients_gb=gradients_gb,
            optimizer_states_gb=optimizer_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown)

    def _calculate_megatron_activations(
        self,
        tp_size: int,
        pp_size: int,
        seq_parallel: bool,
    ) -> float:
        """Calculate activation memory for Megatron-LM-style parallelism.

        Args:
            tp_size: Tensor parallelism size
            pp_size: Pipeline parallelism size
            seq_parallel: Whether sequence parallelism is enabled

        Returns:
            Activation memory in GB
        """
        # Base per-replica activation footprint (already TP-aware).
        base_activations = calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=tp_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

        # Each pipeline stage only holds num_layers / pp_size layers.
        pp_factor = 1.0 / pp_size

        # Sequence parallelism splits the sequence dimension across the
        # tensor-parallel group.
        seq_factor = 1.0 / tp_size if seq_parallel and tp_size > 1 else 1.0

        return base_activations * pp_factor * seq_factor
src/gpu_mem_calculator/engines/pytorch.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PyTorch DDP (Distributed Data Parallel) engine implementation.
2
+
3
+ This is the baseline implementation without any memory optimizations.
4
+
5
+ Reference: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
6
+ Reference: https://blog.eleuther.ai/transformer-math/
7
+ """
8
+
9
+ from gpu_mem_calculator.core.formulas import (
10
+ calculate_activation_memory,
11
+ calculate_gradient_memory,
12
+ calculate_optimizer_memory,
13
+ calculate_overhead,
14
+ calculate_parameter_memory,
15
+ )
16
+ from gpu_mem_calculator.core.models import (
17
+ MemoryBreakdown,
18
+ MemoryResult,
19
+ )
20
+ from gpu_mem_calculator.engines.base import BaseEngine
21
+
22
+
23
class PyTorchDDPEngine(BaseEngine):
    """PyTorch DDP memory calculation.

    DDP replicates the model on each GPU, so memory is not sharded.
    Each GPU holds a full copy of the model, gradients, and optimizer states.
    """

    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for PyTorch DDP training.

        For DDP:
        - Model parameters: Full model on each GPU
        - Gradients: Full gradients on each GPU
        - Optimizer states: Full optimizer states on each GPU (FP32)
        - Activations: Batch size dependent, split by data parallel

        Returns:
            MemoryResult with complete memory breakdown
        """
        model_cfg = self.model_config
        train_cfg = self.training_config
        total_params = model_cfg.num_parameters
        dtype_name = train_cfg.dtype.value

        # DDP keeps a full, unsharded replica of weights and gradients on
        # every rank, both stored in the training dtype.
        weights_gb = calculate_parameter_memory(
            num_params=total_params,
            dtype=dtype_name,
        )
        grads_gb = calculate_gradient_memory(
            num_params=total_params,
            dtype=dtype_name,
        )

        # Optimizer states are also fully replicated (FP32 for Adam/AdamW).
        opt_gb = calculate_optimizer_memory(
            num_params=total_params,
            optimizer=train_cfg.optimizer.value,
        )

        # Activations scale with batch size and model architecture.
        acts_gb = calculate_activation_memory(
            batch_size=train_cfg.batch_size,
            seq_len=model_cfg.max_seq_len,
            hidden_size=model_cfg.hidden_size,
            num_layers=model_cfg.num_layers,
            num_attention_heads=model_cfg.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=train_cfg.activation_checkpointing,
            moe_enabled=model_cfg.moe_enabled,
            num_experts=model_cfg.num_experts,
            top_k=model_cfg.top_k,
            expert_intermediate_size=model_cfg.expert_intermediate_size,
        )

        # Framework overhead on top of the accounted components.
        subtotal = weights_gb + grads_gb + opt_gb + acts_gb

        breakdown = MemoryBreakdown(
            model_params_gb=weights_gb,
            gradients_gb=grads_gb,
            optimizer_states_gb=opt_gb,
            activations_gb=acts_gb,
            overhead_gb=calculate_overhead(subtotal),
        )

        return self._create_result(breakdown)
src/gpu_mem_calculator/exporters/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Framework configuration exporters."""
2
+
3
+ from gpu_mem_calculator.exporters.accelerate import AccelerateExporter
4
+ from gpu_mem_calculator.exporters.axolotl import AxolotlExporter
5
+ from gpu_mem_calculator.exporters.lightning import LightningExporter
6
+ from gpu_mem_calculator.exporters.manager import ExportFormat, ExportManager
7
+
8
# Public API of the exporters package (re-exported for `from ... import *`
# and to make the supported exporters explicit).
__all__ = [
    "ExportManager",
    "ExportFormat",
    "AccelerateExporter",
    "LightningExporter",
    "AxolotlExporter",
]
src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (628 Bytes). View file
 
src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc ADDED
Binary file (7.81 kB). View file
 
src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc ADDED
Binary file (9.07 kB). View file
 
src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc ADDED
Binary file (9.41 kB). View file