diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..50bb14c4db1c10f694e812d22ef59929e302dece
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,92 @@
+# Git
+.git
+.gitignore
+.github
+
+# Docker
+Dockerfile
+.dockerignore
+
+# Python
+__pycache__
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+venv/
+env/
+ENV/
+.venv/
+.env
+
+# Testing
+.pytest_cache/
+.coverage
+coverage.xml
+htmlcov/
+.tox/
+.mypy_cache/
+.ruff_cache/
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Claude
+.claude/
+.mcp.json
+
+# Documentation (source files included, but skip extras)
+docs/
+*.md
+!README.md
+
+# Project specific
+*.log
+.env
+.venv/
+
+# CI/CD
+CODE_OF_CONDUCT.md
+CONTRIBUTING.md
+MARKETING.md
+SECURITY.md
+CHANGELOG.md
+
+# Screenshots and images
+*.png
+*.jpg
+*.jpeg
+*.gif
+!screenshot.png
+
+# Test files
+tests/
+examples/
+configs/
+
+# MCP server config
+.mcp.json
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..9b839e9bebc66a3498e285538b5bd85051a4e045
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,40 @@
+# Dockerfile for Hugging Face Spaces
+# GPU Memory Calculator - FastAPI Web Application
+
+FROM python:3.12-slim
+
+# Set working directory
+WORKDIR /app
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+ PYTHONDONTWRITEBYTECODE=1 \
+ PORT=7860
+
+# Install system dependencies
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ gcc \
+ && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first for better Docker layer caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy project files
+COPY . .
+
+# Install the package in editable mode
+RUN pip install --no-cache-dir -e .
+
+# Expose Hugging Face Spaces default port
+EXPOSE 7860
+
+# Health check endpoint
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/').read()"
+
+# Run the FastAPI application with uvicorn
+CMD ["uvicorn", "web.app:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/README.md b/README.md
index 902ce660e11c35c768d6103bffd24cf545f2c7a8..82a7ba4f4fb18a6d1663fe79c9a425d64051c99a 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,62 @@
---
-title: Gpu Memory Calculator
-emoji: ๐ป
-colorFrom: gray
-colorTo: yellow
+title: GPU Memory Calculator
+emoji: 🔮
+colorFrom: blue
+colorTo: purple
sdk: docker
pinned: false
-license: apache-2.0
-short_description: Calculates GPU memory for training, inference, and more
+license: mit
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# GPU Memory Calculator
+
+Calculate GPU memory requirements for training and running Large Language Models (LLMs). Supports multiple training engines (PyTorch DDP, DeepSpeed ZeRO, Megatron-LM, FSDP), inference engines (HuggingFace, vLLM, TGI, TensorRT-LLM, SGLang), and multi-node training configurations.
+
+## Features
+
+- **Training Memory Calculation**: Calculate memory for PyTorch DDP, DeepSpeed ZeRO (0-3), Megatron-LM, FSDP, and hybrid approaches
+- **Inference Memory Calculation**: Estimate memory requirements for HuggingFace Transformers, vLLM, TGI, TensorRT-LLM, and SGLang
+- **Multi-Node Support**: Calculate network overhead for distributed training across multiple nodes
+- **Model Presets**: Pre-configured settings for popular models (LLaMA 2, GPT-3, Mixtral, GLM, Qwen, DeepSeek-MoE)
+- **Configuration Export**: Generate configs for Accelerate, Lightning, Axolotl, DeepSpeed, YAML, and JSON
+- **Batch Size Optimization**: Automatically find the maximum batch size that fits in GPU memory
+
+## Supported Training Engines
+
+- PyTorch DDP (Distributed Data Parallel)
+- DeepSpeed ZeRO (Stages 0-3) with CPU/NVMe offloading
+- Megatron-LM (Tensor + Pipeline Parallelism)
+- PyTorch FSDP (Fully Sharded Data Parallel)
+- Megatron-LM + DeepSpeed (Hybrid)
+
+## Supported Inference Engines
+
+- HuggingFace Transformers
+- vLLM (PagedAttention)
+- Text Generation Inference (TGI)
+- TensorRT-LLM
+- SGLang (RadixAttention)
+
+## How to Use
+
+1. **Select a preset model** or configure your own
+2. **Choose training/inference engine** and adjust parameters
+3. **Calculate** memory requirements instantly
+4. **Export** configurations to your preferred framework
+
+## Example Use Cases
+
+- Planning GPU requirements for LLM training
+- Optimizing batch sizes for your hardware
+- Comparing memory efficiency across engines
+- Estimating KV cache memory for inference
+- Calculating multi-node network overhead
+
+## Links
+
+- [GitHub Repository](https://github.com/George614/gpu-mem-calculator)
+- [Documentation](https://github.com/George614/gpu-mem-calculator/blob/main/README.md)
+
+## License
+
+MIT License - see [LICENSE](https://github.com/George614/gpu-mem-calculator/blob/main/LICENSE) for details.
diff --git a/cli/main.py b/cli/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd35d265a2e992d473bdafdff54fa5c1fba7b5c9
--- /dev/null
+++ b/cli/main.py
@@ -0,0 +1,399 @@
+"""CLI interface for GPU Memory Calculator."""
+
+import json
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+import click
+
+if TYPE_CHECKING:
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+ from gpu_mem_calculator.core.models import MemoryResult
+
+
+@click.group()
+@click.version_option(version="0.1.0")
+def main() -> None:
+ """GPU Memory Calculator for LLM Training.
+
+ Calculate GPU memory requirements for training Large Language Models
+ with various training engines (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP).
+ """
+ pass
+
+
+@main.command()
+@click.option(
+ "--config",
+ "-c",
+ type=click.Path(exists=True),
+ help="Path to JSON configuration file",
+)
+@click.option(
+ "--preset",
+ "-p",
+ type=str,
+ help="Name of a preset model configuration",
+)
+@click.option(
+ "--output",
+ "-o",
+ type=click.Path(),
+ help="Output file path (default: stdout)",
+)
+@click.option(
+ "--format",
+ "-f",
+ type=click.Choice(["json", "yaml", "table"]),
+ default="table",
+ help="Output format (default: table)",
+)
+def calculate(
+ config: str | None,
+ preset: str | None,
+ output: str | None,
+ format: Literal["json", "yaml", "table"],
+) -> None:
+ """Calculate GPU memory requirements from config file or preset.
+
+ Examples:
+ gpu-mem-calc calculate --config configs/llama2_7b.json
+ gpu-mem-calc calculate --preset llama2-7b
+ gpu-mem-calc calculate -p mixtral-8x7b --format json
+ """
+ if not config and not preset:
+ click.echo("Error: Either --config or --preset is required", err=True)
+ sys.exit(1)
+
+ if config and preset:
+ click.echo("Error: Cannot use both --config and --preset", err=True)
+ sys.exit(1)
+
+ try:
+ import tempfile
+
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+
+ if preset:
+ # Load preset configuration
+ from gpu_mem_calculator.config.presets import get_preset_config
+
+ preset_config = get_preset_config(preset)
+ if preset_config is None:
+ click.echo(
+ f"Error: Preset '{preset}' not found. "
+ "Use 'gpu-mem-calc presets' to list available presets.",
+ err=True,
+ )
+ sys.exit(1)
+
+ # Write preset to temp file for from_config_file
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+ json.dump(preset_config, f, indent=2)
+ temp_path = f.name
+
+ calculator = GPUMemoryCalculator.from_config_file(temp_path)
+ Path(temp_path).unlink() # Clean up temp file
+ elif config:
+ calculator = GPUMemoryCalculator.from_config_file(config)
+ else:
+ # This should never happen due to the checks above
+ click.echo("Error: Either --config or --preset is required", err=True)
+ sys.exit(1)
+
+ result = calculator.calculate()
+
+ # Format output
+ if format == "json":
+ output_text = json.dumps(result.model_dump(mode="json"), indent=2)
+ elif format == "yaml":
+ try:
+ import yaml # type: ignore[import-untyped]
+
+ output_text = yaml.dump(result.model_dump(mode="json"), default_flow_style=False)
+ except ImportError:
+ click.echo(
+ "Error: YAML format requires PyYAML. Install with: pip install pyyaml",
+ err=True,
+ )
+ sys.exit(1)
+ else: # table
+ output_text = _format_result_as_table(result, calculator)
+
+ # Write output
+ if output:
+ Path(output).write_text(output_text)
+ click.echo(f"Results written to {output}")
+ else:
+ click.echo(output_text)
+
+ except Exception as e:
+ click.echo(f"Error: {e}", err=True)
+ sys.exit(1)
+
+
+@main.command()
+@click.argument(
+ "params",
+ type=float,
+ required=True,
+)
+@click.option(
+ "--gpus",
+ "-g",
+ type=int,
+ default=1,
+ help="Number of GPUs (default: 1)",
+)
+@click.option(
+ "--gpu-mem",
+ "-m",
+ type=float,
+ default=80.0,
+ help="GPU memory in GB (default: 80.0)",
+)
+@click.option(
+ "--engine",
+ "-e",
+ type=click.Choice(["pytorch", "deepspeed", "megatron", "fsdp"]),
+ default="pytorch",
+ help="Training engine (default: pytorch)",
+)
+@click.option(
+ "--dtype",
+ "-d",
+ type=click.Choice(["fp32", "fp16", "bf16"]),
+ default="bf16",
+ help="Data type (default: bf16)",
+)
+def quick(
+ params: float,
+ gpus: int,
+ gpu_mem: float,
+ engine: str,
+ dtype: str,
+) -> None:
+ """Quick calculation from model size (in billions of parameters).
+
+ Example:
+ gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
+ """
+ try:
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+ from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ GPUConfig,
+ ModelConfig,
+ ParallelismConfig,
+ TrainingConfig,
+ )
+
+ # Map engine string to EngineType
+ engine_map = {
+ "pytorch": EngineType.PYTORCH_DDP,
+ "deepspeed": EngineType.DEEPSPEED,
+ "megatron": EngineType.MEGATRON_LM,
+ "fsdp": EngineType.FSDP,
+ }
+
+ # Map dtype string to DType
+ dtype_map = {
+ "fp32": DType.FP32,
+ "fp16": DType.FP16,
+ "bf16": DType.BF16,
+ }
+
+ # Create a minimal config for quick calculation
+ # Estimate model architecture from parameter count
+ # Rough approximation based on typical transformer models
+ num_params = int(params * 1e9)
+
+ # Estimate hidden size and layers from param count
+ # These are rough approximations
+ if params <= 1:
+ hidden_size, num_layers = 768, 12
+ elif params <= 7:
+ hidden_size, num_layers = 4096, 32
+ elif params <= 13:
+ hidden_size, num_layers = 5120, 40
+ elif params <= 30:
+ hidden_size, num_layers = 6656, 60
+ elif params <= 65:
+ hidden_size, num_layers = 8192, 80
+ else:
+ hidden_size, num_layers = 12288, 96
+
+ model_config = ModelConfig(
+ name="quick-estimate",
+ num_parameters=num_params,
+ num_layers=num_layers,
+ hidden_size=hidden_size,
+ num_attention_heads=hidden_size // 128,
+ vocab_size=32000,
+ max_seq_len=2048,
+ )
+
+ training_config = TrainingConfig(
+ batch_size=1,
+ gradient_accumulation_steps=1,
+ dtype=dtype_map[dtype],
+ )
+
+ parallelism_config = ParallelismConfig(data_parallel_size=gpus)
+
+ engine_config = EngineConfig(
+ type=engine_map[engine],
+ zero_stage=2 if engine == "deepspeed" else None,
+ )
+
+ gpu_config = GPUConfig(num_gpus=gpus, gpu_memory_gb=gpu_mem)
+
+ calculator = GPUMemoryCalculator(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ engine_config=engine_config,
+ gpu_config=gpu_config,
+ )
+
+ result = calculator.calculate()
+
+ # Display results
+ click.echo(_format_result_as_table(result, calculator))
+
+ except Exception as e:
+ click.echo(f"Error: {e}", err=True)
+ sys.exit(1)
+
+
+@main.command()
+@click.argument(
+ "config_path",
+ type=click.Path(exists=True),
+)
+def validate(config_path: str) -> None:
+ """Validate a configuration file.
+
+ Example:
+ gpu-mem-calc validate configs/my_config.json
+ """
+ try:
+ from gpu_mem_calculator.config import ConfigParser
+
+ ConfigParser.parse_full_config(config_path)
+ click.echo(f"โ Configuration file '{config_path}' is valid")
+
+ except Exception as e:
+ click.echo(f"โ Validation failed: {e}", err=True)
+ sys.exit(1)
+
+
+@main.command()
+@click.option(
+ "--format",
+ "-f",
+ type=click.Choice(["list", "json", "table"]),
+ default="list",
+ help="Output format (default: list)",
+)
+def presets(format: str) -> None:
+ """List available model preset configurations.
+
+ Examples:
+ gpu-mem-calc presets
+ gpu-mem-calc presets --format table
+ gpu-mem-calc presets -f json
+ """
+ try:
+ from gpu_mem_calculator.config.presets import list_presets
+
+ all_presets = list_presets()
+
+ if not all_presets:
+ click.echo("No presets found.")
+ return
+
+ if format == "json":
+ click.echo(json.dumps(all_presets, indent=2))
+ elif format == "table":
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+ table = Table(
+ title="Available Model Presets",
+ show_header=True,
+ header_style="bold magenta",
+ )
+ table.add_column("Preset Name", style="cyan", width=25)
+ table.add_column("Display Name", style="green", width=30)
+ table.add_column("Description", style="yellow")
+
+ for name, info in sorted(all_presets.items()):
+ table.add_row(name, info["display_name"], info["description"])
+
+ console.print(table)
+ else: # list format
+ click.echo("Available model presets:\n")
+ for name, info in sorted(all_presets.items()): # type: ignore[annotation-unchecked]
+ click.echo(f" {name:25} - {info['display_name']}")
+ if info.get("description"):
+ click.echo(f"{'':27}{info['description']}")
+ click.echo()
+
+ except Exception as e:
+ click.echo(f"Error: {e}", err=True)
+ sys.exit(1)
+
+
+def _format_result_as_table(result: MemoryResult, calculator: "GPUMemoryCalculator") -> str:
+ """Format result as ASCII table."""
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+
+ # Main results table
+ table = Table(
+ title="GPU Memory Calculation Results",
+ show_header=True,
+ header_style="bold magenta",
+ )
+ table.add_column("Metric", style="cyan", width=30)
+ table.add_column("Value", style="green")
+
+ # Memory results
+ table.add_row("Memory per GPU", f"{result.total_memory_per_gpu_gb:.2f} GB")
+ table.add_row("Total GPU Memory", f"{result.total_memory_all_gpus_gb:.2f} GB")
+ table.add_row("CPU Memory", f"{result.cpu_memory_gb:.2f} GB")
+ table.add_row("", "") # Spacer
+
+ # Breakdown
+ table.add_row("Model Parameters", f"{result.breakdown.model_params_gb:.2f} GB")
+ table.add_row("Gradients", f"{result.breakdown.gradients_gb:.2f} GB")
+ table.add_row("Optimizer States", f"{result.breakdown.optimizer_states_gb:.2f} GB")
+ table.add_row("Activations", f"{result.breakdown.activations_gb:.2f} GB")
+ table.add_row("Overhead", f"{result.breakdown.overhead_gb:.2f} GB")
+ table.add_row("", "") # Spacer
+
+ # Feasibility
+ status = "โ Fits" if result.fits_on_gpu else "โ OOM"
+ table.add_row("Status", status)
+ table.add_row("Memory Utilization", f"{result.memory_utilization_percent:.1f}%")
+ if result.recommended_batch_size:
+ table.add_row("Recommended Batch Size", str(result.recommended_batch_size))
+
+ # Capture table output
+ from io import StringIO
+
+ buffer = StringIO()
+ console.file = buffer
+ console.print(table)
+ return buffer.getvalue()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c5dce0051d91981005d0a3243345fb0c374ef1a1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+# GPU Memory Calculator - Requirements for Hugging Face Spaces
+
+# Core dependencies
+pydantic>=2.0.0
+click>=8.1.0
+pydantic-settings>=2.0.0
+rich>=13.0.0
+
+# Web dependencies
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+jinja2>=3.1.0
diff --git a/src/gpu_mem_calculator.egg-info/PKG-INFO b/src/gpu_mem_calculator.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..f688d67e80ae6860491c9404be9970393de206cf
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/PKG-INFO
@@ -0,0 +1,720 @@
+Metadata-Version: 2.4
+Name: gpu-mem-calculator
+Version: 0.1.0
+Summary: GPU Memory Calculator for LLM Training
+Author: GPU Mem Calculator Team
+License: MIT
+Project-URL: Homepage, https://github.com/George614/gpu-mem-calculator
+Project-URL: Repository, https://github.com/George614/gpu-mem-calculator
+Project-URL: Issues, https://github.com/George614/gpu-mem-calculator/issues
+Keywords: gpu,memory,calculator,llm,large-language-model,training,deepspeed,megatron,pytorch,fsdp,transformer,machine-learning,deep-learning,distributed-training,zero-optimization
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pydantic>=2.0.0
+Requires-Dist: click>=8.1.0
+Requires-Dist: pydantic-settings>=2.0.0
+Requires-Dist: rich>=13.0.0
+Provides-Extra: web
+Requires-Dist: fastapi>=0.100.0; extra == "web"
+Requires-Dist: uvicorn[standard]>=0.23.0; extra == "web"
+Requires-Dist: jinja2>=3.1.0; extra == "web"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Requires-Dist: mypy>=1.5.0; extra == "dev"
+Dynamic: license-file
+
+# GPU Memory Calculator for LLM Training
+
+[](https://opensource.org/licenses/MIT)
+[](https://www.python.org/downloads/)
+[](https://github.com/psf/black)
+[](CONTRIBUTING.md)
+
+A versatile Python application for calculating GPU memory requirements for training Large Language Models with support for multiple training engines including PyTorch DDP, DeepSpeed ZeRO, Megatron-LM, and FSDP.
+
+๐ **[Getting Started Guide](docs/GETTING_STARTED.md)** | ๐ฌ **[FAQ](docs/FAQ.md)** | ๐ค **[Contributing](CONTRIBUTING.md)**
+
+
+
+
+
+## ๐ Why Use This Tool?
+
+Training large language models requires careful memory planning. This calculator helps you:
+
+- **๐ฐ Save costs** by determining the optimal GPU configuration before you start training
+- **โก Avoid OOM errors** by validating your training configuration fits in GPU memory
+- **๐ Compare strategies** across different training engines (DeepSpeed, Megatron, FSDP)
+- **๐ฏ Plan infrastructure** by knowing exactly how many GPUs you need
+- **๐ Scale efficiently** with detailed memory breakdowns for optimization
+
+Whether you're training a 7B parameter model on a single GPU or a 175B model across hundreds of GPUs, this tool provides accurate memory estimates based on proven formulas from DeepSpeed, Megatron-LM, and the latest research.
+
+## โจ Features
+
+### Core Training Calculation
+- ๐ง **Multiple Training Engines**: Support for PyTorch DDP, DeepSpeed ZeRO (stages 1-3), Megatron-LM, Megatron+DeepSpeed, and PyTorch FSDP
+- ๐ฅ๏ธ **Dual Interface**: Both CLI and Web UI for flexible usage
+- ๐ฏ **Preset Models**: Quick-load configurations for popular models (LLaMA 2, GPT-3, etc.)
+- ๐ **Detailed Breakdown**: Memory breakdown by component (parameters, gradients, optimizer states, activations)
+- ✅ **Feasibility Analysis**: Check if your configuration fits on available GPU memory
+- ⚙️ **Easy Config**: JSON-based configuration files with human-readable parameter formats (e.g., "7B", "7000M")
+
+### ๐ Inference Memory Calculation
+- ๐ **Multi-Engine Support**: HuggingFace Transformers, vLLM, TGI, TensorRT-LLM
+- ๐พ **KV Cache Optimization**: Quantization options (NONE, INT8, FP8, INT4)
+- ๐ **Tensor Parallelism**: Automatic memory distribution across GPUs
+- ๐ **Throughput Estimation**: Tokens/second estimates for capacity planning
+- ๐ฏ **Batch Size Optimization**: Find maximum batch size for your hardware
+
+### ๐ Multi-Node Training
+- ๐ **Network Overhead Calculation**: AllReduce, AllGather, ReduceScatter, pipeline communication
+- ๐ก **Interconnect Support**: InfiniBand, NVLink, Ethernet (10G/25G/100G/200G)
+- โก **Hybrid Parallelism Optimization**: Automatic TP+PP+DP strategy optimization
+- ๐ง **ZeRO Stage Impact Analysis**: Compare communication overhead across ZeRO stages
+
+### ๐ Framework Configuration Exporters
+- ๐ฆ **Accelerate Export**: HuggingFace Accelerate config generation
+- โก **Lightning Export**: PyTorch Lightning Trainer configuration
+- ๐ฅ **Axolotl Export**: YAML config for fine-tuning
+- ๐ **File Export**: Save to YAML/JSON formats
+- ๐๏ธ **Format Conversion**: Convert between different framework configs
+
+## ๐ฆ Installation
+
+### Quick Start
+
+### Core Capabilities
+- **Multiple Training Engines**: Support for PyTorch DDP, DeepSpeed ZeRO (stages 0-3), Megatron-LM, Megatron+DeepSpeed, and PyTorch FSDP
+- **Dual Interface**: Both CLI and Web UI for flexible usage
+- **Preset Models**: Quick-load configurations for popular models (LLaMA 2, GPT-3, GLM, Mixtral, etc.)
+- **Detailed Breakdown**: Memory breakdown by component (parameters, gradients, optimizer states, activations)
+- **Feasibility Analysis**: Check if your configuration fits on available GPU memory
+- **Easy Config**: JSON-based configuration files with human-readable parameter formats (e.g., "7B", "7000M")
+
+### Web UI Enhancements
+- **Formula Explanations**: See exactly how memory is calculated with your values plugged in
+- **Real-time Validation**: Client-side validation prevents invalid configurations
+- **Smart Auto-calculation**: Optimized debouncing (1s) with minimum interval protection
+- **Export Capabilities**: Export to DeepSpeed config files, JSON, or copy to clipboard
+- **Batch Size Optimizer**: Automatically find maximum batch size that fits
+- **Comparison Mode**: Save and compare different configurations side-by-side
+- **Accessibility Features**: ARIA labels, keyboard navigation, colorblind-friendly charts
+
+### Advanced Features
+- **MoE Support**: Mixture of Experts models with configurable experts and top-k routing
+- **CPU/NVMe Offloading**: Offload optimizer states and parameters to CPU or NVMe storage
+- **Activation Checkpointing**: 5 levels from none to full checkpointing
+- **Sequence Parallelism**: Optimize memory for long sequences
+- **Result Caching**: Fast repeated calculations with built-in caching
+
+```bash
+pip install git+https://github.com/George614/gpu-mem-calculator.git
+```
+
+### From source
+
+```bash
+git clone https://github.com/George614/gpu-mem-calculator.git
+cd gpu-mem-calculator
+pip install -e .
+```
+
+### For Web UI support
+
+```bash
+pip install -e ".[web]"
+```
+
+### Development installation
+
+```bash
+pip install -e ".[dev]"
+```
+
+## ๐ Use Cases
+
+### Research & Academia
+- Estimate GPU requirements for research projects before requesting compute resources
+- Plan multi-GPU training configurations for large-scale experiments
+- Compare memory efficiency of different training strategies
+
+### Industry & Production
+- Cost optimization: Choose the right GPU type and count for your training workload
+- Capacity planning: Forecast infrastructure needs for model development
+- Debugging: Diagnose OOM errors and optimize memory usage
+
+### Education & Learning
+- Understand how training configuration affects memory consumption
+- Learn about different distributed training strategies
+- Experiment with various optimization techniques safely
+
+## ๐ Usage
+
+### Command Line Interface
+
+#### Using model presets (Recommended)
+
+The calculator includes pre-configured model presets for popular LLMs:
+
+```bash
+# List all available presets
+gpu-mem-calc presets
+
+# Calculate with a preset
+gpu-mem-calc calculate --preset llama2-7b
+gpu-mem-calc calculate --preset mixtral-8x7b --format json
+
+# List presets in table format
+gpu-mem-calc presets --format table
+```
+
+Available presets include:
+- **Dense Models**: LLaMA 2 (7B, 13B, 70B), GPT-3 (175B)
+- **MoE Models**: Mixtral 8x7B, GLM-4 (9B), GLM-4.7 (355B), GLM-4.5 Air (106B),
+ Qwen1.5-MoE-A2.7B, DeepSeek-MoE (16B)
+
+#### Calculate from config file
+
+```bash
+gpu-mem-calc calculate --config configs/llama2_7b_deepspeed.json
+```
+
+#### Quick calculation from model size
+
+```bash
+# Calculate memory for 7B model with 8x80GB GPUs using DeepSpeed
+gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
+
+# With custom GPU memory
+gpu-mem-calc quick 70 --gpus 64 --gpu-mem 80 --engine megatron
+```
+
+#### Validate configuration
+
+```bash
+gpu-mem-calc validate configs/my_config.json
+```
+
+### Web Interface
+
+Start the web server:
+
+```bash
+python -m gpu_mem_calculator.web.app
+```
+
+Or using uvicorn directly:
+
+```bash
+uvicorn gpu_mem_calculator.web.app:app --reload
+```
+
+Then open your browser to `http://localhost:8000`
+
+### Python API
+
+#### Training Memory Calculation
+
+```python
+from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+from gpu_mem_calculator.core.models import (
+ ModelConfig,
+ TrainingConfig,
+ ParallelismConfig,
+ EngineConfig,
+ GPUConfig,
+)
+
+# Create configuration
+model_config = ModelConfig(
+ name="llama2-7b",
+ num_parameters=7_000_000_000,
+ num_layers=32,
+ hidden_size=4096,
+ num_attention_heads=32,
+ vocab_size=32000,
+ max_seq_len=4096,
+)
+
+training_config = TrainingConfig(
+ batch_size=4,
+ gradient_accumulation_steps=4,
+ dtype="bf16",
+ optimizer="adamw",
+)
+
+parallelism_config = ParallelismConfig(
+ data_parallel_size=8,
+)
+
+engine_config = EngineConfig(
+ type="deepspeed",
+ zero_stage=3,
+ offload_optimizer="cpu",
+)
+
+gpu_config = GPUConfig(
+ num_gpus=8,
+ gpu_memory_gb=80,
+)
+
+# Calculate memory
+calculator = GPUMemoryCalculator(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ engine_config=engine_config,
+ gpu_config=gpu_config,
+)
+
+result = calculator.calculate()
+
+print(f"Memory per GPU: {result.total_memory_per_gpu_gb:.2f} GB")
+print(f"Fits on GPU: {result.fits_on_gpu}")
+print(f"Utilization: {result.memory_utilization_percent:.1f}%")
+```
+
+#### ๐ Inference Memory Calculation
+
+```python
+from gpu_mem_calculator.inference.calculator import InferenceMemoryCalculator
+from gpu_mem_calculator.core.models import (
+ ModelConfig,
+ InferenceConfig,
+ InferenceEngineType,
+ GPUConfig,
+)
+
+# Create configurations
+model_config = ModelConfig(
+ name="llama2-7b",
+ num_parameters=7_000_000_000,
+ num_layers=32,
+ hidden_size=4096,
+ num_attention_heads=32,
+ max_seq_len=4096,
+)
+
+inference_config = InferenceConfig(
+ batch_size=32,
+ kv_cache_quantization="int8", # NONE, INT8, FP8, INT4
+ tensor_parallel_size=2,
+ gpu_memory_utilization=0.9,
+)
+
+gpu_config = GPUConfig(num_gpus=2, gpu_memory_gb=80)
+
+# Calculate for different inference engines
+calculator = InferenceMemoryCalculator(model_config, inference_config, gpu_config)
+
+# vLLM inference
+result_vllm = calculator.calculate(InferenceEngineType.VLLM)
+print(f"vLLM: {result_vllm.total_memory_per_gpu_gb:.2f} GB")
+print(f"Max batch size: {result_vllm.max_supported_batch_size}")
+print(f"Throughput: {result_vllm.estimated_throughput_tokens_per_sec:.0f} tokens/sec")
+
+# TensorRT-LLM inference
+result_trt = calculator.calculate(InferenceEngineType.TENSORRT_LLM)
+print(f"TensorRT-LLM: {result_trt.total_memory_per_gpu_gb:.2f} GB")
+```
+
+#### ๐ Multi-Node Network Overhead
+
+```python
+from gpu_mem_calculator.core.multinode import MultiNodeCalculator
+from gpu_mem_calculator.core.models import (
+ NodeConfig,
+ InterconnectType,
+)
+
+# Configure multi-node setup
+node_config = NodeConfig(
+ num_nodes=4,
+ gpus_per_node=8,
+ interconnect_type=InterconnectType.INFINIBAND,
+)
+
+calculator = MultiNodeCalculator(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ node_config=node_config,
+ engine_config=engine_config,
+)
+
+# Calculate network overhead
+network_overhead = calculator.calculate_network_overhead()
+print(f"AllReduce: {network_overhead.allreduce_gb:.2f} GB")
+print(f"AllGather: {network_overhead.allgather_gb:.2f} GB")
+print(f"Time overhead: {network_overhead.estimated_overhead_ms_per_step:.2f} ms/step")
+
+# Optimize hybrid parallelism
+from gpu_mem_calculator.core.models import HybridParallelismConfig
+
+hybrid_config = HybridParallelismConfig(
+ auto_optimize=True,
+ prefer_pipeline_parallel=True,
+ enable_sequence_parallel=True,
+)
+
+optimized_parallelism = calculator.optimize_hybrid_parallelism(hybrid_config)
+print(f"Optimized TP: {optimized_parallelism.tensor_parallel_size}")
+print(f"Optimized PP: {optimized_parallelism.pipeline_parallel_size}")
+print(f"Optimized DP: {optimized_parallelism.data_parallel_size}")
+```
+
+#### ๐ Export Framework Configurations
+
+```python
+from gpu_mem_calculator.exporters.manager import ExportManager, ExportFormat
+
+# Create export manager
+manager = ExportManager(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ engine_config=engine_config,
+ node_config=node_config,
+)
+
+# Export to different formats
+accelerate_config = manager.export(ExportFormat.ACCELERATE)
+lightning_config = manager.export(ExportFormat.LIGHTNING)
+axolotl_config = manager.export(ExportFormat.AXOLOTL)
+
+# Export to file
+manager.export_to_file(ExportFormat.ACCELERATE, "accelerate_config.yaml")
+manager.export_to_file(ExportFormat.JSON, "config.json")
+
+# Get DeepSpeed config
+deepspeed_config = manager.export(ExportFormat.DEEPSPEED)
+```
+
+## Configuration File Format
+
+```json
+{
+ "model": {
+ "name": "llama2-7b",
+ "num_parameters": "7B",
+ "num_layers": 32,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "vocab_size": 32000,
+ "max_seq_len": 4096
+ },
+ "training": {
+ "batch_size": 4,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 1
+ },
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+}
+```
+
+## Supported Training Engines
+
+### PyTorch DDP (Baseline)
+Standard Distributed Data Parallel training without memory optimizations.
+
+### DeepSpeed ZeRO
+- **ZeRO-1**: Shard optimizer states
+- **ZeRO-2**: Shard optimizer states + gradients
+- **ZeRO-3**: Shard everything (parameters, gradients, optimizer states)
+- Supports CPU/NVMe offloading
+
+### Megatron-LM
+Tensor and pipeline parallelism with activation checkpointing support.
+
+### Megatron + DeepSpeed
+Combines Megatron-LM's model parallelism with DeepSpeed ZeRO's optimizer sharding.
+
+### PyTorch FSDP
+Fully Sharded Data Parallel with multiple sharding strategies.
+
+## Memory Formulas
+
+The calculator uses formulas verified against authoritative sources:
+
+### Base Components
+
+**Model Parameters:**
+- FP16/BF16: `num_params × 2 bytes`
+- FP32: `num_params × 4 bytes`
+
+**Gradients:**
+- FP16/BF16: `num_params × 2 bytes`
+- FP32: `num_params × 4 bytes`
+
+**Optimizer States** (per optimizer type):
+- **Adam/AdamW**: `num_params × 12 bytes`
+  - 4 bytes: FP32 parameter copy
+  - 4 bytes: Momentum
+  - 4 bytes: Variance
+- **AdamW 8-bit**: `num_params × 2 bytes` (quantized)
+- **SGD**: `num_params × 4 bytes` (FP32 only, no momentum)
+
+**Activations:**
+- Approximation: `batch_size × seq_len × hidden_size × num_layers × ~16 bytes/token/layer`
+- Varies based on activation checkpointing level
+
+### DeepSpeed ZeRO Stages
+
+**ZeRO-0** (Baseline - same as PyTorch DDP):
+```
+total_per_gpu = 2×params + 2×params + 12×params + activations
+ = 16×params + activations
+```
+
+**ZeRO-1** (Shard optimizer states):
+```
+total_per_gpu = 2×params + 2×params + (12×params)/num_gpus + activations
+```
+
+**ZeRO-2** (Shard optimizer + gradients):
+```
+total_per_gpu = 2×params + (2×params)/num_gpus + (12×params)/num_gpus + activations
+```
+
+**ZeRO-3** (Shard everything):
+```
+total_per_gpu = largest_layer_memory + (16×params)/num_gpus + activations
+where largest_layer_memory ≈ 4×(num_params/10)
+```
+
+**CPU/NVMe Offloading:**
+- Optimizer states offloaded to CPU: 0 GB GPU memory
+- Parameters offloaded to CPU/NVMe: Dynamically gathered during compute
+
+### Verification
+
+All formulas have been verified against:
+- ✅ 18 comprehensive test scenarios (100% pass rate)
+- ✅ EleutherAI Transformer Math 101
+- ✅ Microsoft Research ZeRO Blog
+- ✅ DeepSpeed Official Documentation
+- ✅ PyTorch FSDP Documentation
+
+### References
+
+- [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) - Comprehensive transformer memory breakdown
+- [Microsoft Research ZeRO Blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) - ZeRO optimization techniques
+- [DeepSpeed Memory Documentation](https://deepspeed.readthedocs.io/en/latest/memory.html) - Official DeepSpeed memory formulas
+
+## Example Configurations
+
+### LLaMA 2 7B with DeepSpeed ZeRO-3
+```bash
+gpu-mem-calc calculate --config configs/llama2_7b_deepspeed.json
+```
+
+### GPT-3 175B with Megatron-LM
+```bash
+gpu-mem-calc calculate --config configs/gpt3_175b_megatron.json
+```
+
+### Custom 1B model with PyTorch DDP
+```bash
+gpu-mem-calc calculate --config configs/pytorch_ddp_example.json
+```
+
+## Web UI Features
+
+### Interactive Interface
+- **Real-time Calculations**: Auto-calculates as you adjust parameters (1s debounce)
+- **Client-side Validation**: Instant feedback on configuration errors before API calls
+- **Smart Presets**: Quick-load model configurations (LLaMA 2, GPT-3, GLM, Mixtral, Qwen, DeepSeek)
+- **Visual Breakdown**: Color-coded bar chart with patterns for colorblind accessibility
+- **Feasibility Status**: Clear indicators showing if configuration fits on GPU
+
+### Formula Explanations
+- **Detailed Breakdowns**: See exact formulas used with your values plugged in
+- **Component-by-Component**: Each memory component explained with formula and result
+- **Authoritative References**: Links to EleutherAI, Microsoft Research, DeepSpeed docs
+- **Engine-Specific Details**: Different formulas for PyTorch DDP, DeepSpeed ZeRO, FSDP, Megatron-LM
+
+### Advanced Tools
+- **Export to DeepSpeed**: Generate `deepspeed_config.json` files automatically
+- **Batch Size Optimizer**: Find maximum batch size that fits your GPU memory
+- **Config Persistence**: Save configurations to browser localStorage
+- **Comparison Mode**: Compare different configurations side-by-side
+
+### Accessibility
+- **ARIA Labels**: Full screen reader support throughout the interface
+- **Keyboard Navigation**: All features accessible via keyboard
+- **Colorblind-Friendly**: Patterns and textures supplement colors in charts
+- **High Contrast**: Clear visual indicators with multiple cues
+
+### API Endpoints
+- `POST /api/calculate` - Calculate GPU memory requirements
+- `POST /api/explain-formula` - Get detailed formula explanation
+- `POST /api/export/deepspeed` - Export DeepSpeed config file
+- `POST /api/optimize/batch-size` - Find maximum batch size
+- `GET /api/preset/{preset_name}` - Load model preset
+
+## Development
+
+### Running Tests
+
+```bash
+pytest tests/
+```
+
+### Test Coverage
+
+The calculator includes comprehensive testing:
+- **Unit Tests**: Core calculation logic for each engine type
+- **Integration Tests**: End-to-end configuration validation
+- **Formula Verification**: 18 scenarios verifying formula accuracy
+- **API Tests**: Web API endpoint testing
+- **Accessibility Tests**: Screen reader and keyboard navigation
+
+All formulas verified accurate against authoritative sources with 100% test pass rate.
+
+### Code Formatting
+
+```bash
+black src/ cli/ web/
+ruff check src/ cli/ web/
+```
+
+### Type Checking
+
+```bash
+mypy src/
+```
+
+## Recent Improvements
+
+### Latest Updates
+- ✨ Added formula explanation feature with detailed breakdowns
+- ✨ Added client-side validation for better UX
+- ✨ Added batch size optimizer API
+- ✨ Added DeepSpeed config export functionality
+- ✨ Added comprehensive input validation
+- ✨ Added result caching for performance
+- ♿ Added ARIA labels for full accessibility
+- ♿ Added colorblind patterns to charts
+- 🐛 Fixed optimizer formulas to be optimizer-specific
+- 🐛 Fixed Pydantic namespace warnings
+
+### Verification Status
+- ✅ All 18 test scenarios passing (100%)
+- ✅ Formulas verified against EleutherAI, Microsoft Research, DeepSpeed docs
+- ✅ Optimizer formulas corrected for AdamW, AdamW 8-bit, and SGD
+- ✅ ZeRO stage formulas validated (0, 1, 2, 3)
+- ✅ Engine type formulas validated (PyTorch DDP, DeepSpeed, FSDP, Megatron-LM)
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request. See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
+
+## ๐ References
+
+The memory calculations in this tool are based on authoritative sources:
+
+**Core Memory Formulas:**
+- [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) - Comprehensive breakdown of transformer memory requirements
+- [Microsoft Research ZeRO Blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) - ZeRO optimization techniques
+- [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/abs/2204.13323) - Activation checkpointing strategies
+
+**Engine Documentation:**
+- [DeepSpeed Memory Documentation](https://deepspeed.readthedocs.io/en/latest/memory.html) - Official DeepSpeed memory formulas
+- [NVIDIA Megatron-LM](https://github.com/NVIDIA/Megatron-LM) - Tensor and pipeline parallelism
+- [PyTorch FSDP Documentation](https://pytorch.org/docs/stable/fsdp.html) - Fully sharded data parallel
+- [PyTorch DDP Tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) - Distributed data parallel
+
+**Related Tools:**
+- [llm-analysis](https://github.com/cli99/llm-analysis) - LLM memory analysis
+- [vram-calculator](https://github.com/furiousteabag/vram-calculator) - VRAM calculation utilities
+
+## ๐ค Community & Support
+
+- ๐ [Documentation](README.md)
+- ๐ [Issue Tracker](https://github.com/George614/gpu-mem-calculator/issues)
+- ๐ฌ [Discussions](https://github.com/George614/gpu-mem-calculator/discussions)
+- ๐ง Contact the maintainers via GitHub
+
+### Star History
+
+If you find this tool useful, please consider giving it a star! ⭐
+
+## ๐ Roadmap
+
+- [x] Inference memory calculation
+- [x] Multi-node training configurations
+- [x] Export to training framework configs (Accelerate, Lightning, Axolotl)
+- [ ] PyPI package distribution
+- [ ] Support for more model architectures (Vision Transformers, Diffusion models)
+- [ ] Real-time memory monitoring dashboard
+- [ ] CLI commands for inference and export features
+
+## ๐ Acknowledgments
+
+This tool was inspired by and builds upon the excellent work of:
+- [DeepSpeed Memory Estimator](https://deepspeed.readthedocs.io/en/latest/memory.html) - ZeRO memory optimization formulas
+- [llm-analysis](https://github.com/cli99/llm-analysis) - LLM memory analysis methodology
+- [vram-calculator](https://github.com/furiousteabag/vram-calculator) - VRAM calculation approach
+
+Special thanks to the EleutherAI community for their comprehensive [Transformer Math 101](https://blog.eleuther.ai/transformer-math/) guide, which provides detailed formulas for transformer memory calculations.
+
+## ๐ License
+
+MIT License - see [LICENSE](LICENSE) for details.
+
+## ๐ Citation
+
+If you use this tool in your research, please cite:
+
+```bibtex
+@software{gpu_mem_calculator,
+ title = {GPU Memory Calculator for LLM Training},
+ author = {GPU Mem Calculator Team},
+ year = {2024},
+ url = {https://github.com/George614/gpu-mem-calculator}
+}
+```
+
+---
+
+
+ Made with ❤️ for the ML community
+
+
+
+ ⭐ Star us on GitHub •
+ 🐛 Report a Bug •
+ 💡 Request a Feature
+
+
diff --git a/src/gpu_mem_calculator.egg-info/SOURCES.txt b/src/gpu_mem_calculator.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3ab30d6f4c4ddbbe18d80cc890b1c66dc873abfc
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/SOURCES.txt
@@ -0,0 +1,46 @@
+LICENSE
+README.md
+pyproject.toml
+src/gpu_mem_calculator/__init__.py
+src/gpu_mem_calculator/py.typed
+src/gpu_mem_calculator.egg-info/PKG-INFO
+src/gpu_mem_calculator.egg-info/SOURCES.txt
+src/gpu_mem_calculator.egg-info/dependency_links.txt
+src/gpu_mem_calculator.egg-info/entry_points.txt
+src/gpu_mem_calculator.egg-info/requires.txt
+src/gpu_mem_calculator.egg-info/top_level.txt
+src/gpu_mem_calculator/cli/__init__.py
+src/gpu_mem_calculator/cli/main.py
+src/gpu_mem_calculator/config/__init__.py
+src/gpu_mem_calculator/config/parser.py
+src/gpu_mem_calculator/config/presets.py
+src/gpu_mem_calculator/core/__init__.py
+src/gpu_mem_calculator/core/calculator.py
+src/gpu_mem_calculator/core/formulas.py
+src/gpu_mem_calculator/core/models.py
+src/gpu_mem_calculator/core/multinode.py
+src/gpu_mem_calculator/engines/__init__.py
+src/gpu_mem_calculator/engines/base.py
+src/gpu_mem_calculator/engines/deepspeed.py
+src/gpu_mem_calculator/engines/fsdp.py
+src/gpu_mem_calculator/engines/megatron.py
+src/gpu_mem_calculator/engines/pytorch.py
+src/gpu_mem_calculator/exporters/__init__.py
+src/gpu_mem_calculator/exporters/accelerate.py
+src/gpu_mem_calculator/exporters/axolotl.py
+src/gpu_mem_calculator/exporters/lightning.py
+src/gpu_mem_calculator/exporters/manager.py
+src/gpu_mem_calculator/inference/__init__.py
+src/gpu_mem_calculator/inference/base.py
+src/gpu_mem_calculator/inference/calculator.py
+src/gpu_mem_calculator/inference/huggingface.py
+src/gpu_mem_calculator/inference/tensorrt_llm.py
+src/gpu_mem_calculator/inference/tgi.py
+src/gpu_mem_calculator/inference/vllm.py
+src/gpu_mem_calculator/utils/__init__.py
+src/gpu_mem_calculator/utils/precision.py
+tests/test_calculator.py
+tests/test_comprehensive.py
+tests/test_exporters.py
+tests/test_inference.py
+tests/test_multinode.py
\ No newline at end of file
diff --git a/src/gpu_mem_calculator.egg-info/dependency_links.txt b/src/gpu_mem_calculator.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/src/gpu_mem_calculator.egg-info/entry_points.txt b/src/gpu_mem_calculator.egg-info/entry_points.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e1fd80a3a8d68ab65662f055ef4d082e9c1c1bf1
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+gpu-mem-calc = gpu_mem_calculator.cli:main
diff --git a/src/gpu_mem_calculator.egg-info/requires.txt b/src/gpu_mem_calculator.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d241d2a18141a662cbb9f832803514e123830104
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/requires.txt
@@ -0,0 +1,16 @@
+pydantic>=2.0.0
+click>=8.1.0
+pydantic-settings>=2.0.0
+rich>=13.0.0
+
+[dev]
+pytest>=7.0.0
+pytest-cov>=4.0.0
+black>=23.0.0
+ruff>=0.1.0
+mypy>=1.5.0
+
+[web]
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+jinja2>=3.1.0
diff --git a/src/gpu_mem_calculator.egg-info/top_level.txt b/src/gpu_mem_calculator.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b26471d7f90301fe8c843f68393d1f9a6065626b
--- /dev/null
+++ b/src/gpu_mem_calculator.egg-info/top_level.txt
@@ -0,0 +1 @@
+gpu_mem_calculator
diff --git a/src/gpu_mem_calculator/__init__.py b/src/gpu_mem_calculator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5c38d92c579824dcc8f2cb8e2e90e74505c2cb
--- /dev/null
+++ b/src/gpu_mem_calculator/__init__.py
@@ -0,0 +1,3 @@
+"""GPU Memory Calculator for LLM Training."""
+
+__version__ = "0.1.0"
diff --git a/src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0672fbc3b0e1a5956876bbb7c37c70870f0285d3
Binary files /dev/null and b/src/gpu_mem_calculator/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/cli/__init__.py b/src/gpu_mem_calculator/cli/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ba3b37df278c165fe3f50a4e6ee520b4217638
--- /dev/null
+++ b/src/gpu_mem_calculator/cli/__init__.py
@@ -0,0 +1,5 @@
+"""CLI interface for GPU Memory Calculator."""
+
+from gpu_mem_calculator.cli.main import main
+
+__all__ = ["main"]
diff --git a/src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..180200bcd8eef434ddcee44e1b5db827548cf013
Binary files /dev/null and b/src/gpu_mem_calculator/cli/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc b/src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ad0e5dff8e68e55f2ea1d97f3114f1e07ec57ac
Binary files /dev/null and b/src/gpu_mem_calculator/cli/__pycache__/main.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/cli/main.py b/src/gpu_mem_calculator/cli/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd35d265a2e992d473bdafdff54fa5c1fba7b5c9
--- /dev/null
+++ b/src/gpu_mem_calculator/cli/main.py
@@ -0,0 +1,399 @@
+"""CLI interface for GPU Memory Calculator."""
+
+import json
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+import click
+
+if TYPE_CHECKING:
+ from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+ from gpu_mem_calculator.core.models import MemoryResult
+
+
@click.group()
@click.version_option(version="0.1.0")
def main() -> None:
    """GPU Memory Calculator for LLM Training.

    Calculate GPU memory requirements for training Large Language Models
    with various training engines (PyTorch DDP, DeepSpeed, Megatron-LM, FSDP).
    """
    # Click group entry point: subcommands (calculate, quick, validate,
    # presets) attach themselves via @main.command() decorators.
    pass
+
+
@main.command()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True),
    help="Path to JSON configuration file",
)
@click.option(
    "--preset",
    "-p",
    type=str,
    help="Name of a preset model configuration",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output file path (default: stdout)",
)
@click.option(
    "--format",
    "-f",
    type=click.Choice(["json", "yaml", "table"]),
    default="table",
    help="Output format (default: table)",
)
def calculate(
    config: str | None,
    preset: str | None,
    output: str | None,
    format: Literal["json", "yaml", "table"],
) -> None:
    """Calculate GPU memory requirements from config file or preset.

    Exactly one of --config / --preset must be given. Results are printed
    to stdout (or written to --output) in the requested format.

    Examples:
        gpu-mem-calc calculate --config configs/llama2_7b.json
        gpu-mem-calc calculate --preset llama2-7b
        gpu-mem-calc calculate -p mixtral-8x7b --format json
    """
    if not config and not preset:
        click.echo("Error: Either --config or --preset is required", err=True)
        sys.exit(1)

    if config and preset:
        click.echo("Error: Cannot use both --config and --preset", err=True)
        sys.exit(1)

    try:
        import tempfile

        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator

        if preset:
            # Load preset configuration
            from gpu_mem_calculator.config.presets import get_preset_config

            preset_config = get_preset_config(preset)
            if preset_config is None:
                click.echo(
                    f"Error: Preset '{preset}' not found. "
                    "Use 'gpu-mem-calc presets' to list available presets.",
                    err=True,
                )
                sys.exit(1)

            # Write the preset to a temp file so we can reuse from_config_file.
            # delete=False is needed because the parser reopens the file by
            # path; clean up in a finally block so the temp file is not leaked
            # when parsing raises.
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
                json.dump(preset_config, f, indent=2)
                temp_path = f.name

            try:
                calculator = GPUMemoryCalculator.from_config_file(temp_path)
            finally:
                Path(temp_path).unlink(missing_ok=True)
        elif config:
            calculator = GPUMemoryCalculator.from_config_file(config)
        else:
            # Unreachable: guarded above. Kept for type-checkers.
            click.echo("Error: Either --config or --preset is required", err=True)
            sys.exit(1)

        result = calculator.calculate()

        # Format output
        if format == "json":
            output_text = json.dumps(result.model_dump(mode="json"), indent=2)
        elif format == "yaml":
            try:
                import yaml  # type: ignore[import-untyped]

                output_text = yaml.dump(result.model_dump(mode="json"), default_flow_style=False)
            except ImportError:
                click.echo(
                    "Error: YAML format requires PyYAML. Install with: pip install pyyaml",
                    err=True,
                )
                sys.exit(1)
        else:  # table
            output_text = _format_result_as_table(result, calculator)

        # Write output
        if output:
            Path(output).write_text(output_text)
            click.echo(f"Results written to {output}")
        else:
            click.echo(output_text)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
+
+
@main.command()
@click.argument(
    "params",
    type=float,
    required=True,
)
@click.option(
    "--gpus",
    "-g",
    type=int,
    default=1,
    help="Number of GPUs (default: 1)",
)
@click.option(
    "--gpu-mem",
    "-m",
    type=float,
    default=80.0,
    help="GPU memory in GB (default: 80.0)",
)
@click.option(
    "--engine",
    "-e",
    type=click.Choice(["pytorch", "deepspeed", "megatron", "fsdp"]),
    default="pytorch",
    help="Training engine (default: pytorch)",
)
@click.option(
    "--dtype",
    "-d",
    type=click.Choice(["fp32", "fp16", "bf16"]),
    default="bf16",
    help="Data type (default: bf16)",
)
def quick(
    params: float,
    gpus: int,
    gpu_mem: float,
    engine: str,
    dtype: str,
) -> None:
    """Quick calculation from model size (in billions of parameters).

    Example:
        gpu-mem-calc quick 7 --gpus 8 --engine deepspeed
    """
    try:
        # Imports are deferred into the command body; any failure is reported
        # through the generic error handler below.
        from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
        from gpu_mem_calculator.core.models import (
            DType,
            EngineConfig,
            EngineType,
            GPUConfig,
            ModelConfig,
            ParallelismConfig,
            TrainingConfig,
        )

        # Map engine string to EngineType
        engine_map = {
            "pytorch": EngineType.PYTORCH_DDP,
            "deepspeed": EngineType.DEEPSPEED,
            "megatron": EngineType.MEGATRON_LM,
            "fsdp": EngineType.FSDP,
        }

        # Map dtype string to DType
        dtype_map = {
            "fp32": DType.FP32,
            "fp16": DType.FP16,
            "bf16": DType.BF16,
        }

        # Create a minimal config for quick calculation.
        # `params` is given in billions on the command line.
        num_params = int(params * 1e9)

        # Estimate hidden size and layer count from the parameter count.
        # These are rough heuristics; the tiers appear to mirror common
        # LLaMA-family sizes (7B/13B/30B/65B) and GPT-3-scale models —
        # NOTE(review): ballpark only, activation estimates depend on them.
        if params <= 1:
            hidden_size, num_layers = 768, 12
        elif params <= 7:
            hidden_size, num_layers = 4096, 32
        elif params <= 13:
            hidden_size, num_layers = 5120, 40
        elif params <= 30:
            hidden_size, num_layers = 6656, 60
        elif params <= 65:
            hidden_size, num_layers = 8192, 80
        else:
            hidden_size, num_layers = 12288, 96

        model_config = ModelConfig(
            name="quick-estimate",
            num_parameters=num_params,
            num_layers=num_layers,
            # Assumes a head dimension of 128 — TODO confirm against models.
            num_attention_heads=hidden_size // 128,
            hidden_size=hidden_size,
            vocab_size=32000,
            max_seq_len=2048,
        )

        # Minimal training setup: batch size 1, no gradient accumulation.
        training_config = TrainingConfig(
            batch_size=1,
            gradient_accumulation_steps=1,
            dtype=dtype_map[dtype],
        )

        # Pure data parallelism across all requested GPUs.
        parallelism_config = ParallelismConfig(data_parallel_size=gpus)

        # DeepSpeed defaults to ZeRO stage 2; other engines take no stage.
        engine_config = EngineConfig(
            type=engine_map[engine],
            zero_stage=2 if engine == "deepspeed" else None,
        )

        gpu_config = GPUConfig(num_gpus=gpus, gpu_memory_gb=gpu_mem)

        calculator = GPUMemoryCalculator(
            model_config=model_config,
            training_config=training_config,
            parallelism_config=parallelism_config,
            engine_config=engine_config,
            gpu_config=gpu_config,
        )

        result = calculator.calculate()

        # Display results
        click.echo(_format_result_as_table(result, calculator))

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
+
+
@main.command()
@click.argument(
    "config_path",
    type=click.Path(exists=True),
)
def validate(config_path: str) -> None:
    """Validate a configuration file.

    Parses every section of the file; any ConfigParseError (or other
    failure) is reported and the process exits non-zero.

    Example:
        gpu-mem-calc validate configs/my_config.json
    """
    try:
        from gpu_mem_calculator.config import ConfigParser

        ConfigParser.parse_full_config(config_path)
        # Distinct glyphs for success/failure (previously both printed the
        # same mojibake character, making outcomes indistinguishable).
        click.echo(f"✓ Configuration file '{config_path}' is valid")

    except Exception as e:
        click.echo(f"✗ Validation failed: {e}", err=True)
        sys.exit(1)
+
+
@main.command()
@click.option(
    "--format",
    "-f",
    type=click.Choice(["list", "json", "table"]),
    default="list",
    help="Output format (default: list)",
)
def presets(format: str) -> None:
    """List available model preset configurations.

    Examples:
        gpu-mem-calc presets
        gpu-mem-calc presets --format table
        gpu-mem-calc presets -f json
    """
    try:
        from gpu_mem_calculator.config.presets import list_presets

        catalog = list_presets()
        if not catalog:
            click.echo("No presets found.")
            return

        if format == "json":
            click.echo(json.dumps(catalog, indent=2))
        elif format == "table":
            from rich.console import Console
            from rich.table import Table

            console = Console()
            preset_table = Table(
                title="Available Model Presets",
                show_header=True,
                header_style="bold magenta",
            )
            preset_table.add_column("Preset Name", style="cyan", width=25)
            preset_table.add_column("Display Name", style="green", width=30)
            preset_table.add_column("Description", style="yellow")

            for preset_name, meta in sorted(catalog.items()):
                preset_table.add_row(preset_name, meta["display_name"], meta["description"])

            console.print(preset_table)
        else:
            # Plain indented list (the default format).
            click.echo("Available model presets:\n")
            for preset_name, meta in sorted(catalog.items()):
                click.echo(f" {preset_name:25} - {meta['display_name']}")
                if meta.get("description"):
                    click.echo(f"{'':27}{meta['description']}")
                click.echo()

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
+
+
def _format_result_as_table(result: "MemoryResult", calculator: "GPUMemoryCalculator") -> str:
    """Render a MemoryResult as an ASCII table string via rich.

    Args:
        result: The calculation result to display.
        calculator: The calculator that produced the result (currently unused
            in the rendering itself).

    Returns:
        The rendered table as plain text.
    """
    # NOTE: "MemoryResult" must stay a string annotation — it is imported only
    # under TYPE_CHECKING, so an unquoted annotation raises NameError at
    # import time (the module has no `from __future__ import annotations`).
    from io import StringIO

    from rich.console import Console
    from rich.table import Table

    # Main results table
    table = Table(
        title="GPU Memory Calculation Results",
        show_header=True,
        header_style="bold magenta",
    )
    table.add_column("Metric", style="cyan", width=30)
    table.add_column("Value", style="green")

    # Memory results
    table.add_row("Memory per GPU", f"{result.total_memory_per_gpu_gb:.2f} GB")
    table.add_row("Total GPU Memory", f"{result.total_memory_all_gpus_gb:.2f} GB")
    table.add_row("CPU Memory", f"{result.cpu_memory_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Breakdown
    table.add_row("Model Parameters", f"{result.breakdown.model_params_gb:.2f} GB")
    table.add_row("Gradients", f"{result.breakdown.gradients_gb:.2f} GB")
    table.add_row("Optimizer States", f"{result.breakdown.optimizer_states_gb:.2f} GB")
    table.add_row("Activations", f"{result.breakdown.activations_gb:.2f} GB")
    table.add_row("Overhead", f"{result.breakdown.overhead_gb:.2f} GB")
    table.add_row("", "")  # Spacer

    # Feasibility (previously both states printed the same mojibake glyph)
    status = "✓ Fits" if result.fits_on_gpu else "✗ OOM"
    table.add_row("Status", status)
    table.add_row("Memory Utilization", f"{result.memory_utilization_percent:.1f}%")
    if result.recommended_batch_size:
        table.add_row("Recommended Batch Size", str(result.recommended_batch_size))

    # Render into an in-memory buffer by constructing the Console with
    # file=buffer, rather than reassigning console.file after construction.
    buffer = StringIO()
    console = Console(file=buffer)
    console.print(table)
    return buffer.getvalue()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/gpu_mem_calculator/config/__init__.py b/src/gpu_mem_calculator/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5fe79109caffb06ba7b153482f1555ef0247cd5
--- /dev/null
+++ b/src/gpu_mem_calculator/config/__init__.py
@@ -0,0 +1,5 @@
+"""Configuration parsing and defaults."""
+
+from gpu_mem_calculator.config.parser import ConfigParser, load_config, save_config
+
+__all__ = ["ConfigParser", "load_config", "save_config"]
diff --git a/src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6845d43ad5c81d022d051de440eea3d3d471b4d
Binary files /dev/null and b/src/gpu_mem_calculator/config/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc b/src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b793d40e6d97a328acf7af3d275d75075cddefa
Binary files /dev/null and b/src/gpu_mem_calculator/config/__pycache__/parser.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc b/src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6101f11f5352897241db4893e9b64fd071dfc8f1
Binary files /dev/null and b/src/gpu_mem_calculator/config/__pycache__/presets.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/config/parser.py b/src/gpu_mem_calculator/config/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..182b19d437f6ab1befaa129afbec5cd90d6f5f71
--- /dev/null
+++ b/src/gpu_mem_calculator/config/parser.py
@@ -0,0 +1,323 @@
+"""Configuration file parser and utilities."""
+
+import json
+from pathlib import Path
+from typing import Any, cast
+
+from pydantic import ValidationError
+
+from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ GPUConfig,
+ ModelConfig,
+ OffloadDevice,
+ OptimizerType,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
+class ConfigParseError(Exception):
+ """Error parsing configuration file."""
+
+ def __init__(self, message: str, errors: list[Any] | None = None):
+ super().__init__(message)
+ self.errors = errors or []
+
+
class ConfigParser:
    """Parse and validate configuration files.

    String fields (dtype, optimizer, engine, offload device) are normalized
    into the project enums; unknown strings silently fall back to the
    project defaults (BF16 / AdamW / PyTorch DDP / no offload).
    """

    @staticmethod
    def _convert_dtype(value: str) -> DType:
        """Convert string dtype to DType enum (unknown values -> BF16)."""
        dtype_map = {
            "float32": DType.FP32,
            "fp32": DType.FP32,
            "float16": DType.FP16,
            "fp16": DType.FP16,
            "bfloat16": DType.BF16,
            "bf16": DType.BF16,
            "int8": DType.INT8,
            "int4": DType.INT4,
        }
        return dtype_map.get(value.lower(), DType.BF16)

    @staticmethod
    def _convert_optimizer(value: str) -> OptimizerType:
        """Convert string optimizer to OptimizerType enum (unknown -> AdamW)."""
        opt_map = {
            "adam": OptimizerType.ADAM,
            "adamw": OptimizerType.ADAMW,
            "sgd": OptimizerType.SGD,
            "adamw_8bit": OptimizerType.ADAMW_8BIT,
            "adamw-8bit": OptimizerType.ADAMW_8BIT,
        }
        return opt_map.get(value.lower(), OptimizerType.ADAMW)

    @staticmethod
    def _convert_engine(value: str) -> EngineType:
        """Convert string engine to EngineType enum (unknown -> PyTorch DDP)."""
        engine_map = {
            "pytorch": EngineType.PYTORCH_DDP,
            "pytorch_ddp": EngineType.PYTORCH_DDP,
            "ddp": EngineType.PYTORCH_DDP,
            "deepspeed": EngineType.DEEPSPEED,
            "megatron": EngineType.MEGATRON_LM,
            "megatron_lm": EngineType.MEGATRON_LM,
            "megatron-lm": EngineType.MEGATRON_LM,
            "fsdp": EngineType.FSDP,
            "megatron_deepspeed": EngineType.MEGATRON_DEEPSPEED,
        }
        return engine_map.get(value.lower(), EngineType.PYTORCH_DDP)

    @staticmethod
    def _convert_offload(value: str) -> OffloadDevice:
        """Convert string offload to OffloadDevice enum (unknown -> NONE)."""
        offload_map = {
            "none": OffloadDevice.NONE,
            "cpu": OffloadDevice.CPU,
            "nvme": OffloadDevice.NVME,
        }
        return offload_map.get(value.lower(), OffloadDevice.NONE)

    @staticmethod
    def _parse_num_params(value: str | int | float) -> int:
        """Parse number of parameters from various formats.

        Supports:
        - Raw integer/float: 7000000000, 7e9
        - Trillions: "1.5T"
        - Billions: "7B", "7b", "7e9"
        - Millions: "7000M", "7000m", "7000e6"
        - Thousands: "500K"
        - Digit separators: "7,000,000" or "7_000_000"
        """
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            return int(value)

        if isinstance(value, str):
            # Normalize: trim whitespace, uppercase, drop digit separators.
            value = value.strip().upper().replace(",", "").replace("_", "")

            # Magnitude suffixes, checked before scientific notation since
            # none of them collide with the "E" marker.
            suffixes = {
                "T": 1_000_000_000_000,
                "B": 1_000_000_000,
                "M": 1_000_000,
                "K": 1_000,
            }
            for suffix, multiplier in suffixes.items():
                if value.endswith(suffix):
                    return int(float(value[:-1]) * multiplier)

            # Handle scientific notation, e.g. "7E9"
            if "E" in value:
                return int(float(value))

            # Try direct conversion
            return int(value)

        raise ValueError(f"Cannot parse parameter count: {value}")

    @classmethod
    def parse_model_config(cls, data: dict[str, Any]) -> ModelConfig:
        """Parse model configuration from dict.

        Args:
            data: Dictionary with model configuration

        Returns:
            ModelConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            # Convert parameter counts given as strings ("7B", "500M", ...)
            if "num_parameters" in data and isinstance(data["num_parameters"], str):
                data["num_parameters"] = cls._parse_num_params(data["num_parameters"])

            if "largest_layer_params" in data and isinstance(data["largest_layer_params"], str):
                data["largest_layer_params"] = cls._parse_num_params(data["largest_layer_params"])

            return ModelConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid model configuration", e.errors()) from e

    @classmethod
    def parse_training_config(cls, data: dict[str, Any]) -> TrainingConfig:
        """Parse training configuration from dict.

        Args:
            data: Dictionary with training configuration

        Returns:
            TrainingConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            # Convert dtype
            if "dtype" in data and isinstance(data["dtype"], str):
                data["dtype"] = cls._convert_dtype(data["dtype"])

            # Convert optimizer
            if "optimizer" in data and isinstance(data["optimizer"], str):
                data["optimizer"] = cls._convert_optimizer(data["optimizer"])

            return TrainingConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid training configuration", e.errors()) from e

    @classmethod
    def parse_parallelism_config(cls, data: dict[str, Any]) -> ParallelismConfig:
        """Parse parallelism configuration from dict.

        Args:
            data: Dictionary with parallelism configuration

        Returns:
            ParallelismConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            return ParallelismConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid parallelism configuration", e.errors()) from e

    @classmethod
    def parse_engine_config(cls, data: dict[str, Any]) -> EngineConfig:
        """Parse engine configuration from dict.

        Args:
            data: Dictionary with engine configuration

        Returns:
            EngineConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            # Convert engine type
            if "type" in data and isinstance(data["type"], str):
                data["type"] = cls._convert_engine(data["type"])

            # Convert offload options
            if "offload_optimizer" in data and isinstance(data["offload_optimizer"], str):
                data["offload_optimizer"] = cls._convert_offload(data["offload_optimizer"])

            if "offload_param" in data and isinstance(data["offload_param"], str):
                data["offload_param"] = cls._convert_offload(data["offload_param"])

            return EngineConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid engine configuration", e.errors()) from e

    @classmethod
    def parse_gpu_config(cls, data: dict[str, Any]) -> GPUConfig:
        """Parse GPU configuration from dict.

        Args:
            data: Dictionary with GPU configuration

        Returns:
            GPUConfig object

        Raises:
            ConfigParseError: If validation fails
        """
        try:
            return GPUConfig(**data)
        except ValidationError as e:
            raise ConfigParseError("Invalid GPU configuration", e.errors()) from e

    @classmethod
    def parse_file(cls, config_path: str | Path) -> dict[str, Any]:
        """Parse configuration from JSON file.

        Args:
            config_path: Path to configuration file

        Returns:
            Dictionary with parsed configuration

        Raises:
            ConfigParseError: If file cannot be read or parsed
        """
        path = Path(config_path)
        if not path.exists():
            raise ConfigParseError(f"Configuration file not found: {config_path}")

        try:
            with path.open("r") as f:
                data = cast(dict[str, Any], json.load(f))
            return data
        except json.JSONDecodeError as e:
            raise ConfigParseError(f"Invalid JSON in configuration file: {e}") from e
        except Exception as e:
            raise ConfigParseError(f"Error reading configuration file: {e}") from e

    @classmethod
    def parse_full_config(
        cls,
        config_path: str | Path,
    ) -> tuple[ModelConfig, TrainingConfig, ParallelismConfig, EngineConfig, GPUConfig]:
        """Parse complete configuration from file.

        Args:
            config_path: Path to configuration file

        Returns:
            Tuple of (ModelConfig, TrainingConfig, ParallelismConfig, EngineConfig, GPUConfig)

        Raises:
            ConfigParseError: If validation fails
        """
        data = cls.parse_file(config_path)

        try:
            model_config = cls.parse_model_config(data.get("model", {}))
            training_config = cls.parse_training_config(data.get("training", {}))
            parallelism_config = cls.parse_parallelism_config(data.get("parallelism", {}))
            engine_config = cls.parse_engine_config(data.get("engine", {}))
            gpu_config = cls.parse_gpu_config(data.get("hardware", {}))

            return (
                model_config,
                training_config,
                parallelism_config,
                engine_config,
                gpu_config,
            )
        except ConfigParseError:
            raise
        except Exception as e:
            raise ConfigParseError(f"Unexpected error parsing configuration: {e}") from e
+
+
def load_config(config_path: str | Path) -> dict[str, Any]:
    """Read a JSON configuration file and return its raw contents.

    Args:
        config_path: Location of the JSON configuration file.

    Returns:
        The parsed configuration as a dictionary.
    """
    parsed = ConfigParser.parse_file(config_path)
    return parsed
+
+
+def save_config(data: dict[str, Any], output_path: str | Path) -> None:
+ """Save configuration to JSON file.
+
+ Args:
+ data: Configuration dictionary to save
+ output_path: Path to save configuration file
+ """
+ path = Path(output_path)
+ path.parent.mkdir(parents=True, exist_ok=True)
+
+ with path.open("w") as f:
+ json.dump(data, f, indent=2)
diff --git a/src/gpu_mem_calculator/config/presets.py b/src/gpu_mem_calculator/config/presets.py
new file mode 100644
index 0000000000000000000000000000000000000000..043183c9225ffa0c8f6d005b7b840a550c563f64
--- /dev/null
+++ b/src/gpu_mem_calculator/config/presets.py
@@ -0,0 +1,83 @@
+"""Preset model configurations loader.
+
+This module provides a centralized location for managing model preset
+configurations that can be used by both CLI and web interfaces.
+"""
+
+import json
+from pathlib import Path
+from typing import Any, cast
+
# Base directory for the package: four parents up from this file
# (config/ -> gpu_mem_calculator/ -> src/ -> project root).
BASE_DIR = Path(__file__).parent.parent.parent.parent
+
+
def get_presets_file_path() -> Path:
    """Locate the presets JSON file.

    Prefers ``web/presets/models.json`` under the project root and falls
    back to the in-package path used by development installs.

    Returns:
        Path to the presets JSON file (the fallback path is returned
        even when it does not exist)
    """
    candidates = (
        BASE_DIR / "web" / "presets" / "models.json",
        BASE_DIR / "src" / "gpu_mem_calculator" / "presets" / "models.json",
    )
    # Return the first existing candidate; otherwise the last one as-is.
    for candidate in candidates[:-1]:
        if candidate.exists():
            return candidate
    return candidates[-1]
+
+
def load_presets() -> dict[str, dict[str, Any]]:
    """Load every preset model configuration from disk.

    Returns:
        Mapping of preset name to preset entry (display_name,
        description, config). An empty dict is returned when the presets
        file is missing, unreadable, or contains invalid JSON.
    """
    path = get_presets_file_path()

    # A missing or unreadable file is treated the same as "no presets".
    try:
        raw = path.read_text()
    except OSError:
        return {}

    try:
        return cast(dict[str, dict[str, Any]], json.loads(raw))
    except json.JSONDecodeError:
        return {}
+
+
def get_preset_config(preset_name: str) -> dict[str, Any] | None:
    """Fetch the calculator config for a single preset.

    Args:
        preset_name: Name of the preset to retrieve

    Returns:
        The preset's ``config`` section, or ``None`` when no preset with
        that name exists
    """
    entry = load_presets().get(preset_name)

    # Unknown preset -> None; known preset -> only its "config" payload
    # (display metadata is not needed by the calculator).
    if entry is None:
        return None
    return cast(dict[str, Any], entry.get("config", {}))
+
+
def list_presets() -> dict[str, dict[str, str]]:
    """Enumerate available presets with display metadata.

    Returns:
        Mapping of preset name to ``{"display_name": ..., "description": ...}``.
        The preset name doubles as the display name when none is set.
    """
    summary: dict[str, dict[str, str]] = {}
    for name, entry in load_presets().items():
        summary[name] = {
            "display_name": entry.get("display_name", name),
            "description": entry.get("description", ""),
        }
    return summary
diff --git a/src/gpu_mem_calculator/core/__init__.py b/src/gpu_mem_calculator/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2ce84261ff34549a4bdd55250a7b39ad9f51226
--- /dev/null
+++ b/src/gpu_mem_calculator/core/__init__.py
@@ -0,0 +1,24 @@
+"""Core memory calculation models and formulas."""
+
+from gpu_mem_calculator.core.formulas import Precision
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ EngineType,
+ GPUConfig,
+ ModelConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
# Public names re-exported at the gpu_mem_calculator.core level
# (configuration models plus the Precision dtype descriptor).
__all__ = [
    "ModelConfig",
    "TrainingConfig",
    "ParallelismConfig",
    "EngineConfig",
    "EngineType",
    "GPUConfig",
    "Precision",
]
+
+# Import GPUMemoryCalculator separately to avoid circular import
+# Use: from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
diff --git a/src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f36a3e0ca26299eac66e2ce946fa1e4001d658b
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f23111875c25e5c6e16ba9043021e526c71899d4
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/calculator.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b34dac9fa4f2a4fc6a2beacc8d68bdf6e262383b
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/formulas.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c91a728c36b563dace22e719313ad40c450cc23f
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/models.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc b/src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8cf9aad04dac7f175f6676b17fc7c3e50490c09
Binary files /dev/null and b/src/gpu_mem_calculator/core/__pycache__/multinode.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/core/calculator.py b/src/gpu_mem_calculator/core/calculator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7d894a8b97ca7d4ae2268b175c6bf42f59302a6
--- /dev/null
+++ b/src/gpu_mem_calculator/core/calculator.py
@@ -0,0 +1,178 @@
+"""Main GPU memory calculator.
+
+Orchestrates the memory calculation by selecting the appropriate
+training engine and aggregating results.
+"""
+
+from gpu_mem_calculator.config.parser import ConfigParser
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ EngineType,
+ GPUConfig,
+ MemoryResult,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+from gpu_mem_calculator.engines import (
+ DeepSpeedEngine,
+ FSDPEngine,
+ MegatronDeepSpeedEngine,
+ MegatronLMEngine,
+ PyTorchDDPEngine,
+)
+
# Type alias for engine types: the closed set of engine implementations
# that GPUMemoryCalculator._get_engine may return.
EngineTypeAlias = (
    PyTorchDDPEngine | DeepSpeedEngine | MegatronLMEngine | FSDPEngine | MegatronDeepSpeedEngine
)
+
+
class GPUMemoryCalculator:
    """Main GPU memory calculator.

    This class provides a high-level interface for calculating GPU
    memory requirements for LLM training. It selects the training
    engine implementation that matches the engine configuration and
    delegates the memory math to it.
    """

    # Engine type -> implementation class. All engines share the same
    # constructor signature, so construction is table-driven instead of
    # duplicating the keyword arguments in a five-arm match statement.
    # Unknown types fall back to PyTorch DDP in _get_engine.
    _ENGINE_CLASSES = {
        EngineType.PYTORCH_DDP: PyTorchDDPEngine,
        EngineType.DEEPSPEED: DeepSpeedEngine,
        EngineType.MEGATRON_LM: MegatronLMEngine,
        EngineType.FSDP: FSDPEngine,
        EngineType.MEGATRON_DEEPSPEED: MegatronDeepSpeedEngine,
    }

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig | None = None,
        engine_config: EngineConfig | None = None,
        gpu_config: GPUConfig | None = None,
        node_config: NodeConfig | None = None,
    ) -> None:
        """Initialize the calculator.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings (default: no parallelism)
            engine_config: Training engine configuration (default: PyTorch DDP)
            gpu_config: Hardware configuration (default: 1x 80GB GPU)
            node_config: Multi-node configuration (default: single node)
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config or ParallelismConfig()
        self.engine_config = engine_config or EngineConfig()
        self.gpu_config = gpu_config or GPUConfig()
        self.node_config = node_config or NodeConfig()

    def calculate(self) -> MemoryResult:
        """Calculate GPU memory requirements.

        Selects the appropriate training engine based on configuration
        and returns the memory calculation result.

        Returns:
            MemoryResult with complete memory breakdown
        """
        engine = self._get_engine()
        return engine.calculate_memory()

    def _get_engine(self) -> EngineTypeAlias:
        """Get the appropriate training engine instance.

        Unknown engine types default to PyTorch DDP.

        Returns:
            Engine instance configured with current settings
        """
        engine_cls = self._ENGINE_CLASSES.get(self.engine_config.type, PyTorchDDPEngine)
        return engine_cls(
            model_config=self.model_config,
            training_config=self.training_config,
            parallelism_config=self.parallelism_config,
            engine_config=self.engine_config,
            gpu_config=self.gpu_config,
            node_config=self.node_config,
        )

    @classmethod
    def from_config_file(
        cls,
        config_path: str,
    ) -> "GPUMemoryCalculator":
        """Create calculator from configuration file.

        Note: multi-node settings are not read from the file; the
        default single-node NodeConfig is used.

        Args:
            config_path: Path to JSON configuration file

        Returns:
            Configured GPUMemoryCalculator instance
        """
        model_config, training_config, parallelism_config, engine_config, gpu_config = (
            ConfigParser.parse_full_config(config_path)
        )

        return cls(
            model_config=model_config,
            training_config=training_config,
            parallelism_config=parallelism_config,
            engine_config=engine_config,
            gpu_config=gpu_config,
        )

    def to_dict(self) -> dict:
        """Export calculator configuration to dictionary.

        Returns:
            Dictionary with all configuration sections (model, training,
            parallelism, engine, hardware, multinode)
        """
        return {
            "model": self.model_config.model_dump(),
            "training": self.training_config.model_dump(),
            "parallelism": self.parallelism_config.model_dump(),
            "engine": self.engine_config.model_dump(),
            "hardware": self.gpu_config.model_dump(),
            "multinode": self.node_config.model_dump(),
        }
diff --git a/src/gpu_mem_calculator/core/formulas.py b/src/gpu_mem_calculator/core/formulas.py
new file mode 100644
index 0000000000000000000000000000000000000000..73f14a2003ba8684934f34151956a925538dd89f
--- /dev/null
+++ b/src/gpu_mem_calculator/core/formulas.py
@@ -0,0 +1,268 @@
+"""Memory calculation formulas.
+
+This module contains the fundamental formulas for calculating GPU memory
+requirements for LLM training.
+"""
+
+from dataclasses import dataclass
+
+
@dataclass
class Precision:
    """Precision information for a data type.

    This is re-exported from utils.precision for convenience.
    """

    # Canonical dtype name, e.g. "fp16" or "int8".
    name: str
    # Storage width of one parameter in bits.
    bits_per_param: int
    # Storage width of one parameter in bytes (float to allow sub-byte
    # types such as int4).
    bytes_per_param: float
    # True for integer (quantized) types, False for floating point.
    is_integer: bool = False
+
+
def calculate_parameter_memory(
    num_params: int,
    dtype: str,
    num_gpus: int = 1,
) -> float:
    """Calculate memory in GB for model parameters.

    Args:
        num_params: Number of model parameters
        dtype: Data type (e.g., "fp32", "fp16", "bf16", "int8", "int4")
        num_gpus: Number of GPUs for distribution (currently unused
            here — sharding across tensor/pipeline/data parallel ranks
            is applied by the engine implementations instead)

    Returns:
        Memory in GB (full, unsharded size)
    """
    from gpu_mem_calculator.utils.precision import gb_from_params

    # The unsharded footprint is returned; engines divide by the
    # relevant parallelism degrees themselves.
    return gb_from_params(num_params, dtype)
+
+
def calculate_gradient_memory(
    num_params: int,
    dtype: str,
) -> float:
    """Calculate memory in GB for gradients.

    Gradients occupy one value per parameter, typically in the same
    precision as the parameters (even though the optimizer applies its
    update in FP32).

    Args:
        num_params: Number of model parameters
        dtype: Data type for gradients

    Returns:
        Memory in GB
    """
    from gpu_mem_calculator.utils.precision import gb_from_params

    # One gradient per parameter -> same footprint as the parameters.
    return gb_from_params(num_params, dtype)
+
+
def calculate_optimizer_memory(
    num_params: int,
    optimizer: str,
) -> float:
    """Calculate memory in GB for optimizer states.

    Optimizer states are assumed to be kept in FP32. Bytes per
    parameter by optimizer:

    - adam / adamw: 12 bytes (FP32 parameter copy + momentum + variance).
      Reference: https://blog.eleuther.ai/transformer-math/#optimizer-states
      Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
    - adamw_8bit: ~2 bytes (bitsandbytes quantized states).
    - sgd: 4 bytes (FP32 momentum buffer; momentum assumed enabled).
    - anything else: treated like Adam (12 bytes).

    Args:
        num_params: Number of model parameters
        optimizer: Optimizer type (adam, adamw, sgd, adamw_8bit)

    Returns:
        Memory in GB (for FP32 optimizer states)
    """
    from gpu_mem_calculator.utils.precision import gb_from_bytes

    bytes_per_param_by_optimizer = {
        "adam": 12.0,
        "adamw": 12.0,
        "adamw_8bit": 2.0,
        "sgd": 4.0,
    }
    # Unknown optimizer names fall back to the Adam footprint.
    bytes_per_param = bytes_per_param_by_optimizer.get(optimizer.lower(), 12.0)

    return gb_from_bytes(num_params * bytes_per_param)
+
+
def calculate_activation_memory(
    batch_size: int,
    seq_len: int,
    hidden_size: int,
    num_layers: int,
    num_attention_heads: int,
    tensor_parallel_size: int = 1,
    activation_checkpointing: int = 0,
    moe_enabled: bool = False,
    num_experts: int = 1,
    top_k: int = 1,
    expert_intermediate_size: int | None = None,
) -> float:
    """Calculate approximate memory in GB for activations.

    This is an estimate: true activation memory depends on the specific
    model implementation and framework.

    Reference: https://blog.eleuther.ai/transformer-math/#activations
    Reference: https://arxiv.org/abs/2204.13323 ("Reducing Activation
    Recomputation in Large Transformer Models")

    The EleutherAI estimate for selective activation checkpointing is
    sbhL(10 + 24/t) bytes, with s = seq_len, b = batch_size,
    h = hidden_size, L = num_layers, and t = tensor_parallel_size. This
    implementation approximates it with a flat 16 bytes per hidden unit
    per token per layer — a middle-ground value that stays simple to
    understand and modify.

    For MoE models the estimate is scaled down because only top_k of the
    num_experts experts are active per token, plus a small router
    (gating) overhead.

    Args:
        batch_size: Batch size per GPU
        seq_len: Sequence length
        hidden_size: Hidden dimension size
        num_layers: Number of transformer layers
        num_attention_heads: Number of attention heads (not used by this
            heuristic)
        tensor_parallel_size: Tensor parallelism degree
        activation_checkpointing: Checkpointing level (0-4)
        moe_enabled: Whether model uses Mixture of Experts
        num_experts: Total number of experts (for MoE)
        top_k: Number of active experts per token (for MoE)
        expert_intermediate_size: Expert intermediate layer size (for MoE)

    Returns:
        Memory in GB
    """
    from gpu_mem_calculator.utils.precision import gb_from_bytes

    # Heuristic: ~16 bytes of activations per hidden unit per token per
    # layer (attention outputs, MLP activations, layer norms, ...).
    per_token_layer_bytes = hidden_size * 16

    # MoE scaling: fraction of active experts plus 0.1 router overhead,
    # capped so the estimate never exceeds the dense equivalent.
    moe_scale = 1.0
    if moe_enabled and num_experts > 1:
        moe_scale = min(1.0, top_k / num_experts + 0.1)

    # Experts are often wider than the dense 2x-hidden baseline; scale
    # up by the width ratio, capped at a 2x increase.
    if moe_enabled and expert_intermediate_size:
        moe_scale *= min(2.0, expert_intermediate_size / (hidden_size * 2))

    total_bytes = (
        batch_size
        * seq_len
        * num_layers
        * per_token_layer_bytes
        * moe_scale
        / tensor_parallel_size
    )

    # Fraction of activations retained at each checkpointing level:
    # 0 = none checkpointed (100%) ... 4 = full checkpointing (20%).
    retained = (1.0, 0.8, 0.6, 0.4, 0.2)[min(activation_checkpointing, 4)]
    total_bytes *= retained

    return gb_from_bytes(total_bytes)
+
+
def calculate_overhead(
    total_memory: float,
    overhead_factor: float = 0.2,
) -> float:
    """Estimate additional memory overhead.

    Covers CUDA context, allocator fragmentation, temporary buffers and
    similar costs that scale roughly with total usage.

    Args:
        total_memory: Total calculated memory in GB
        overhead_factor: Fraction to add for overhead (default 20%)

    Returns:
        Overhead memory in GB
    """
    overhead = total_memory * overhead_factor
    return overhead
+
+
+def estimate_largest_layer_params(
+ hidden_size: int,
+ num_attention_heads: int,
+ intermediate_size: int | None = None,
+) -> int:
+ """Estimate the largest layer parameters for ZeRO-3 calculations.
+
+ The largest layer is typically the MLP layer or attention projection.
+
+ Args:
+ hidden_size: Hidden dimension size
+ num_attention_heads: Number of attention heads
+ intermediate_size: MLP intermediate size (default 4 * hidden_size)
+
+ Returns:
+ Estimated number of parameters in the largest layer
+ """
+ if intermediate_size is None:
+ intermediate_size = 4 * hidden_size
+
+ # MLP layer: hidden_size * intermediate_size * 2 (for up and down projections)
+ mlp_params = hidden_size * intermediate_size * 2
+
+ # Attention output projection: hidden_size * hidden_size
+ attn_params = hidden_size * hidden_size
+
+ return max(mlp_params, attn_params)
diff --git a/src/gpu_mem_calculator/core/models.py b/src/gpu_mem_calculator/core/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..53266999d8990cefbe1465e3e9379e4620e9f03a
--- /dev/null
+++ b/src/gpu_mem_calculator/core/models.py
@@ -0,0 +1,568 @@
+"""Data models for GPU memory calculation."""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Literal, cast
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic_core.core_schema import ValidationInfo as FieldValidationInfo
+
+
class EngineType(str, Enum):
    """Supported training engine types.

    Values are the strings accepted in configuration files.
    """

    PYTORCH_DDP = "pytorch_ddp"  # PyTorch DistributedDataParallel
    DEEPSPEED = "deepspeed"  # DeepSpeed (ZeRO stages, offload)
    MEGATRON_LM = "megatron_lm"  # Megatron-LM
    FSDP = "fsdp"  # PyTorch Fully Sharded Data Parallel
    MEGATRON_DEEPSPEED = "megatron_deepspeed"  # Megatron + DeepSpeed hybrid
+
+
class InferenceEngineType(str, Enum):
    """Supported inference engine types.

    Values are the strings accepted in configuration files.
    """

    HUGGINGFACE = "huggingface"  # Hugging Face Transformers
    VLLM = "vllm"
    TGI = "tgi"  # Text Generation Inference
    TENSORRT_LLM = "tensorrt_llm"
    TRTLLM = "trtllm"  # NOTE(review): presumably an accepted alias for TENSORRT_LLM — confirm
    SGLANG = "sglang"
+
+
class OptimizerType(str, Enum):
    """Supported optimizer types.

    Values match the names understood by calculate_optimizer_memory.
    """

    ADAM = "adam"
    ADAMW = "adamw"
    SGD = "sgd"
    ADAMW_8BIT = "adamw_8bit"  # bitsandbytes-style 8-bit AdamW
+
+
class DType(str, Enum):
    """Supported data types."""

    FP32 = "fp32"  # 32-bit float
    FP16 = "fp16"  # 16-bit float
    BF16 = "bf16"  # bfloat16
    INT8 = "int8"  # 8-bit integer (quantized)
    INT4 = "int4"  # 4-bit integer (quantized)
+
+
class OffloadDevice(str, Enum):
    """CPU offload options (DeepSpeed-style offload targets)."""

    NONE = "none"  # keep on GPU
    CPU = "cpu"  # offload to host memory
    NVME = "nvme"  # offload to NVMe storage
+
+
class ModelConfig(BaseModel):
    """Model architecture configuration.

    Describes a (possibly MoE) transformer. largest_layer_params is
    auto-derived by a model validator when not supplied.
    """

    name: str = Field(default="custom", description="Model name")
    num_parameters: int = Field(gt=0, description="Total number of parameters")
    num_layers: int = Field(gt=0, description="Number of transformer layers")
    hidden_size: int = Field(gt=0, description="Hidden dimension size")
    num_attention_heads: int = Field(gt=0, description="Number of attention heads")
    vocab_size: int = Field(default=32000, gt=0, description="Vocabulary size")
    max_seq_len: int = Field(default=2048, gt=0, description="Maximum sequence length")
    largest_layer_params: int | None = Field(
        default=None,
        gt=0,
        description="Largest layer parameters (auto-calculated if not provided)",
    )

    # MoE (Mixture of Experts) parameters
    moe_enabled: bool = Field(default=False, description="Enable Mixture of Experts")
    num_experts: int = Field(default=8, ge=1, description="Number of experts in MoE")
    top_k: int = Field(default=2, ge=1, description="Number of experts activated per token (top-k)")
    expert_intermediate_size: int | None = Field(
        default=None,
        gt=0,
        description="Expert intermediate layer size (defaults to 4x hidden_size)",
    )
    shared_expert_intermediate_size: int | None = Field(
        default=None,
        gt=0,
        description="Shared expert intermediate size (for models like GLM with shared experts)",
    )

    @model_validator(mode="after")
    def calculate_largest_layer(self) -> ModelConfig:
        """Calculate largest layer params if not provided.

        Runs after field validation, so an explicitly supplied value is
        left untouched.
        """
        if self.largest_layer_params is not None:
            return self
        # Calculate it
        # NOTE: hidden_size is validated gt=0, so the truthiness checks
        # on `hidden` below are always true in practice.
        hidden = self.hidden_size
        moe_enabled = self.moe_enabled

        if hidden and moe_enabled:
            # For MoE: largest layer includes expert parameters
            # (up + down projection of one expert: h * i * 2).
            expert_intermediate = self.expert_intermediate_size or hidden * 4
            self.largest_layer_params = int(hidden * expert_intermediate * 2)
        elif hidden:
            # Dense model: attention output + MLP
            # NOTE(review): this yields 4*h^2, but
            # formulas.estimate_largest_layer_params computes
            # h * (4h) * 2 = 8*h^2 for the same default dense MLP —
            # confirm which estimate is intended.
            self.largest_layer_params = int(hidden * hidden * 4)
        return self

    @property
    def effective_num_experts(self) -> int:
        """Get effective number of experts (returns 1 if MoE disabled)."""
        return self.num_experts if self.moe_enabled else 1

    @property
    def active_experts(self) -> int:
        """Get number of active experts per token (top_k or 1 if dense)."""
        return self.top_k if self.moe_enabled else 1
+
+
class TrainingConfig(BaseModel):
    """Training hyperparameters configuration."""

    batch_size: int = Field(default=1, gt=0, description="Batch size per GPU")
    gradient_accumulation_steps: int = Field(
        default=1,
        gt=0,
        description="Gradient accumulation steps",
    )
    optimizer: OptimizerType = Field(default=OptimizerType.ADAMW, description="Optimizer type")
    dtype: DType = Field(default=DType.BF16, description="Data type for training")
    activation_checkpointing: int = Field(
        default=0,
        ge=0,
        le=4,
        description="Activation checkpointing level (0-4)",
    )

    @property
    def effective_batch_size(self) -> int:
        """Calculate effective batch size with gradient accumulation.

        Per GPU (batch_size is per GPU); does not account for the
        data-parallel degree.
        """
        return self.batch_size * self.gradient_accumulation_steps
+
+
class ParallelismConfig(BaseModel):
    """Parallelism configuration.

    Defaults (all 1, sequence parallel off) describe single-GPU training.
    """

    tensor_parallel_size: int = Field(default=1, ge=1, description="Tensor parallelism degree")
    pipeline_parallel_size: int = Field(default=1, ge=1, description="Pipeline parallelism degree")
    data_parallel_size: int = Field(default=1, ge=1, description="Data parallelism degree")
    sequence_parallel: bool = Field(default=False, description="Enable sequence parallelism")

    @property
    def total_parallel_size(self) -> int:
        """Calculate total parallelism degree.

        Product of the three degrees, i.e. the total number of ranks
        this layout occupies.
        """
        return self.tensor_parallel_size * self.pipeline_parallel_size * self.data_parallel_size
+
+
class EngineConfig(BaseModel):
    """Training engine specific configuration.

    Fields that apply to only one engine (zero_stage / zero_init for
    DeepSpeed, sharding_strategy for FSDP) are carried here regardless
    of the selected type.
    """

    type: EngineType = Field(default=EngineType.PYTORCH_DDP, description="Training engine type")
    # DeepSpeed only: ZeRO optimization stage (0-3).
    zero_stage: int | None = Field(
        default=None,
        ge=0,
        le=3,
        description="DeepSpeed ZeRO stage (only for DeepSpeed engine)",
    )
    offload_optimizer: OffloadDevice = Field(
        default=OffloadDevice.NONE,
        description="CPU offload for optimizer states",
    )
    offload_param: OffloadDevice = Field(
        default=OffloadDevice.NONE,
        description="CPU offload for parameters",
    )
    zero_init: bool = Field(
        default=True,
        description="Use ZeRO initialization (only for DeepSpeed ZeRO-3)",
    )
    # FSDP only: how model state is sharded across ranks.
    sharding_strategy: Literal["no_shard", "shard_grad_op", "full_shard"] = Field(
        default="full_shard",
        description="FSDP sharding strategy",
    )
+
+
class GPUConfig(BaseModel):
    """Hardware configuration."""

    num_gpus: int = Field(default=1, ge=1, description="Number of GPUs")
    gpu_memory_gb: float = Field(default=80.0, gt=0, description="GPU memory in GB")
    total_gpu_memory_gb: float | None = Field(
        default=None,
        description="Total GPU memory (calculated if not provided)",
    )

    @model_validator(mode="after")
    def calculate_total_memory(self) -> GPUConfig:
        """Fill in total GPU memory when not explicitly provided.

        A ``model_validator`` is used instead of a ``field_validator``
        because pydantic v2 does not run field validators on default
        values — with the previous field validator, ``GPUConfig()``
        silently kept ``total_gpu_memory_gb`` as ``None``. This also
        matches the pattern already used by ModelConfig.
        """
        if self.total_gpu_memory_gb is None:
            self.total_gpu_memory_gb = self.num_gpus * self.gpu_memory_gb
        return self
+
+
class InterconnectType(str, Enum):
    """Multi-node interconnect types.

    Used by NodeConfig to pick a default bandwidth when none is given
    explicitly (see NodeConfig.get_interconnect_bandwidth_gbps).
    """

    INFINIBAND = "infiniband"  # default bandwidth assumed: 200 Gbps (HDR200)
    NVLINK = "nvlink"  # default bandwidth assumed: 300 Gbps
    ETHERNET_10G = "ethernet_10g"
    ETHERNET_25G = "ethernet_25g"
    ETHERNET_100G = "ethernet_100g"
    ETHERNET_200G = "ethernet_200g"
+
+
class NodeConfig(BaseModel):
    """Multi-node configuration."""

    num_nodes: int = Field(default=1, ge=1, description="Number of nodes")
    gpus_per_node: int | None = Field(
        default=None,
        ge=1,
        description="GPUs per node (calculated from num_gpus if not provided)",
    )
    interconnect_type: InterconnectType = Field(
        default=InterconnectType.INFINIBAND,
        description="Interconnect type between nodes",
    )
    interconnect_bandwidth_gbps: float | None = Field(
        default=None,
        gt=0,
        description="Interconnect bandwidth in Gbps (default: auto from type)",
    )

    @field_validator("gpus_per_node")
    @classmethod
    def calculate_gpus_per_node(cls, v: int | None, info: FieldValidationInfo) -> int | None:
        """Calculate GPUs per node if not provided.

        NOTE(review): pydantic v2 does not run field validators on
        default values, so when gpus_per_node is omitted this validator
        never fires and the field stays None — confirm intended.
        NOTE(review): "num_gpus" is not a field of NodeConfig (it lives
        on GPUConfig), so info.data.get("num_gpus", 1) always yields 1
        here and the computed value is effectively always 1 — verify.
        """
        if v is None:
            num_nodes = cast(int, info.data.get("num_nodes", 1))
            num_gpus = cast(int, info.data.get("num_gpus", 1))
            return max(1, num_gpus // num_nodes)
        return v

    def get_interconnect_bandwidth_gbps(self) -> float:
        """Get interconnect bandwidth in Gbps.

        Returns bandwidth from config or default based on interconnect type.
        """
        if self.interconnect_bandwidth_gbps:
            return self.interconnect_bandwidth_gbps

        # Default bandwidth values for each interconnect type
        bandwidth_defaults = {
            InterconnectType.INFINIBAND: 200.0,  # HDR200 InfiniBand
            InterconnectType.NVLINK: 300.0,  # NVLink/NVSwitch
            InterconnectType.ETHERNET_10G: 10.0,
            InterconnectType.ETHERNET_25G: 25.0,
            InterconnectType.ETHERNET_100G: 100.0,
            InterconnectType.ETHERNET_200G: 200.0,
        }
        # 100 Gbps fallback for any future/unknown interconnect type.
        return bandwidth_defaults.get(self.interconnect_type, 100.0)

    @property
    def is_multi_node(self) -> bool:
        """Check if this is a multi-node configuration."""
        return self.num_nodes > 1
+
+
class NetworkOverhead(BaseModel):
    """Network communication overhead for multi-node training.

    The *_gb fields are communication volumes in GB broken down by
    collective type (presumably per training step, matching the
    per-step time estimate below — confirm with the producer).
    """

    allreduce_gb: float = Field(default=0.0, ge=0, description="AllReduce communication in GB")
    allgather_gb: float = Field(default=0.0, ge=0, description="AllGather communication in GB")
    reducescatter_gb: float = Field(
        default=0.0, ge=0, description="ReduceScatter communication in GB"
    )
    point_to_point_gb: float = Field(
        default=0.0, ge=0, description="Point-to-point communication in GB"
    )
    total_overhead_gb: float = Field(default=0.0, ge=0, description="Total network overhead in GB")
    estimated_overhead_ms_per_step: float | None = Field(
        default=None,
        description="Estimated communication overhead per training step in milliseconds",
    )
+
+
class HybridParallelismConfig(BaseModel):
    """Hybrid parallelism configuration for optimal multi-node scaling.

    Knobs for automatic parallelism-strategy selection; all behavior is
    implemented by the consumers of this config.
    """

    auto_optimize: bool = Field(
        default=False,
        description="Automatically optimize parallelism strategy for given hardware",
    )
    target_gpu_utilization: float = Field(
        default=0.85,
        gt=0.0,
        le=1.0,
        description="Target GPU memory utilization (0.0-1.0)",
    )
    prefer_pipeline_parallel: bool = Field(
        default=False,
        description="Prefer pipeline parallelism over data parallel for multi-node",
    )
    max_pipeline_chunks: int | None = Field(
        default=None,
        ge=1,
        description="Maximum number of pipeline chunks (virtual stages)",
    )
    enable_sequence_parallel: bool = Field(
        default=True,
        description="Enable sequence parallelism for long sequences",
    )
    sequence_parallel_threshold: int = Field(
        default=4096,
        ge=1,
        description="Sequence length threshold for enabling sequence parallel",
    )
+
+
class MemoryBreakdown(BaseModel):
    """Memory calculation result breakdown."""

    # Disable pydantic's protected "model_" namespace so the
    # model_params_gb field name does not trigger a warning.
    model_config = ConfigDict(protected_namespaces=())

    model_params_gb: float = Field(ge=0, description="Model parameters memory in GB")
    gradients_gb: float = Field(ge=0, description="Gradients memory in GB")
    optimizer_states_gb: float = Field(ge=0, description="Optimizer states memory in GB")
    activations_gb: float = Field(ge=0, description="Activations memory in GB")
    overhead_gb: float = Field(default=0.0, ge=0, description="Additional overhead in GB")

    @property
    def total_memory_gb(self) -> float:
        """Total memory in GB (sum of all components)."""
        return (
            self.model_params_gb
            + self.gradients_gb
            + self.optimizer_states_gb
            + self.activations_gb
            + self.overhead_gb
        )
+
+
class MemoryResult(BaseModel):
    """Complete memory calculation result.

    Produced by the training engines; combines per-GPU and aggregate
    totals with a per-component breakdown and fit diagnostics.
    """

    total_memory_per_gpu_gb: float = Field(ge=0, description="Total memory per GPU in GB")
    total_memory_all_gpus_gb: float = Field(ge=0, description="Total memory across all GPUs in GB")
    cpu_memory_gb: float = Field(default=0.0, ge=0, description="CPU memory required in GB")
    breakdown: MemoryBreakdown = Field(description="Memory breakdown by component")
    network_overhead: NetworkOverhead | None = Field(
        default=None,
        description="Network communication overhead for multi-node training",
    )
    fits_on_gpu: bool = Field(description="Whether the config fits on available GPU")
    memory_utilization_percent: float = Field(ge=0, description="Memory utilization percentage")
    recommended_batch_size: int | None = Field(
        default=None,
        description="Recommended batch size if current doesn't fit",
    )
    multi_node_info: dict | None = Field(
        default=None,
        description="Additional multi-node configuration info",
    )
+
+
class KVCacheQuantization(str, Enum):
    """KV cache quantization options."""

    NONE = "none"  # keep KV cache in the model's compute dtype
    INT8 = "int8"
    FP8 = "fp8"
    INT4 = "int4"
+
+
class InferenceMemoryBreakdown(BaseModel):
    """Memory breakdown for inference workloads.

    Like MemoryBreakdown but without training-only components
    (gradients, optimizer states); adds the KV cache instead.
    """

    # Disable pydantic's protected "model_" namespace so the
    # model_params_gb field name does not trigger a warning.
    model_config = ConfigDict(protected_namespaces=())

    model_params_gb: float = Field(ge=0, description="Model parameters memory in GB")
    kv_cache_gb: float = Field(ge=0, description="KV cache memory in GB")
    activations_gb: float = Field(ge=0, description="Activation memory in GB")
    overhead_gb: float = Field(default=0.0, ge=0, description="Additional overhead in GB")

    @property
    def total_memory_gb(self) -> float:
        """Total memory in GB (sum of all components)."""
        return self.model_params_gb + self.kv_cache_gb + self.activations_gb + self.overhead_gb
+
+
class InferenceConfig(BaseModel):
    """Inference-specific configuration.

    Holds generic serving options plus engine-specific knobs, grouped by
    backend (TGI, vLLM, TensorRT-LLM, SGLang).  Engine-specific fields are
    ``None``/disabled by default so a config is valid for any backend.
    """

    batch_size: int = Field(default=1, gt=0, description="Batch size for inference")
    max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="Override max sequence length for inference (default: use model config)",
    )
    kv_cache_quantization: KVCacheQuantization = Field(
        default=KVCacheQuantization.NONE,
        description="KV cache quantization type",
    )
    use_kv_cache: bool = Field(default=True, description="Enable KV cache for generation")
    tensor_parallel_size: int = Field(default=1, ge=1, description="Tensor parallelism degree")
    enable_streaming: bool = Field(default=False, description="Enable streaming inference")

    # Common inference options
    gpu_memory_utilization: float = Field(
        default=0.9,
        gt=0.0,
        le=1.0,
        description="GPU memory utilization target (0.0-1.0)",
    )

    # TGI-specific options
    max_total_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum total tokens (input + output) - defines memory budget",
    )
    max_input_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum input tokens",
    )
    max_batch_total_tokens: int | None = Field(
        default=None,
        gt=0,
        description="TGI: Maximum total tokens across all batches",
    )
    tgi_quantize: Literal[
        "none",
        "awq",
        "eetq",
        "exl2",
        "gptq",
        "marlin",
        "bitsandbytes",
        "bitsandbytes-nf4",
        "bitsandbytes-fp4",
        "fp8",
    ] = Field(
        default="none",
        description="TGI: Weight quantization method",
    )
    tgi_dtype: Literal["float16", "bfloat16"] = Field(
        default="bfloat16",
        description="TGI: Data type for inference",
    )
    sharded: bool = Field(default=False, description="TGI: Enable sharded inference")
    num_shard: int | None = Field(
        default=None,
        ge=1,
        description="TGI: Number of shards for sharded inference",
    )

    # vLLM-specific options
    block_size: int | None = Field(
        default=None,
        ge=1,
        description="vLLM: Block size for KV cache management (default: 16)",
    )
    swap_space_gb: float = Field(default=0.0, ge=0.0, description="vLLM: CPU swap space in GB")
    enable_prefix_caching: bool = Field(default=False, description="vLLM: Enable prefix caching")
    enforce_eager: bool = Field(
        default=False,
        description="vLLM: Enable eager mode (disable CUDA graph)",
    )
    max_num_batched_tokens: int | None = Field(
        default=None,
        gt=0,
        description="vLLM: Maximum number of batched tokens",
    )
    max_num_seqs: int | None = Field(
        default=None,
        gt=0,
        description="vLLM: Maximum number of sequences in a batch",
    )
    vllm_quantization: Literal["none", "awq", "gptq", "squeezellm", "fp8"] = Field(
        default="none",
        description="vLLM: Weight quantization method",
    )

    # TensorRT-LLM-specific options
    trt_max_batch_size: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum batch size",
    )
    trt_max_input_len: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum input length",
    )
    trt_max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="TensorRT-LLM: Maximum sequence length",
    )
    trt_max_beam_width: int | None = Field(
        default=None,
        ge=1,
        description="TensorRT-LLM: Maximum beam width for beam search",
    )

    # SGLang-specific options
    chunk_size: int | None = Field(
        default=None,
        ge=1,
        description="SGLang: Prefill chunk size for long contexts (default: 8192)",
    )
    max_running_requests: int | None = Field(
        default=None,
        ge=1,
        description="SGLang: Maximum number of concurrent requests",
    )
    disable_radix_cache: bool = Field(
        default=False,
        description="SGLang: Disable RadixAttention cache (for debugging)",
    )
    enable_p2p: bool = Field(
        default=False,
        description="SGLang: Enable P2P attention for multi-GPU",
    )
    disable_custom_all_reduce: bool = Field(
        default=False,
        description="SGLang: Disable custom all-reduce kernel",
    )
    attention_backend: Literal["flashinfer", "triton", "torch"] = Field(
        default="flashinfer",
        description="SGLang: Attention backend implementation",
    )
    enable_torch_compile: bool = Field(
        default=False,
        description="SGLang: Enable torch.compile for model optimization",
    )
    radix_cache_max_seq_len: int | None = Field(
        default=None,
        gt=0,
        description="SGLang: Maximum sequence length for RadixCache",
    )
    speculative_algo: Literal["default", "medusa", "eagle"] = Field(
        default="default",
        description="SGLang: Speculative decoding algorithm",
    )
    multi_lora_enabled: bool = Field(default=False, description="SGLang: Enable multi-LoRA serving")
+
+
class InferenceMemoryResult(BaseModel):
    """Inference memory calculation result.

    Produced by the inference calculator; optional fields are only
    populated when the corresponding estimate is available.
    """

    total_memory_per_gpu_gb: float = Field(ge=0, description="Total memory per GPU in GB")
    total_memory_all_gpus_gb: float = Field(ge=0, description="Total memory across all GPUs in GB")
    breakdown: InferenceMemoryBreakdown = Field(description="Memory breakdown by component")
    fits_on_gpu: bool = Field(description="Whether the config fits on available GPU")
    # May exceed 100 when the configuration does not fit.
    memory_utilization_percent: float = Field(ge=0, description="Memory utilization percentage")
    max_supported_batch_size: int | None = Field(
        default=None,
        description="Maximum batch size that fits in GPU memory",
    )
    estimated_throughput_tokens_per_sec: float | None = Field(
        default=None,
        description="Estimated throughput in tokens/second",
    )
diff --git a/src/gpu_mem_calculator/core/multinode.py b/src/gpu_mem_calculator/core/multinode.py
new file mode 100644
index 0000000000000000000000000000000000000000..35b6b02fd333438fb5eeb65154c06ac2da023b89
--- /dev/null
+++ b/src/gpu_mem_calculator/core/multinode.py
@@ -0,0 +1,308 @@
+"""Multi-node training calculator.
+
+Handles network communication overhead calculation and hybrid
+parallelism optimization for multi-node training configurations.
+"""
+
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ EngineType,
+ HybridParallelismConfig,
+ ModelConfig,
+ NetworkOverhead,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
class MultiNodeCalculator:
    """Calculator for multi-node training overhead and optimization.

    This class provides:
    - Network communication overhead estimation
    - Hybrid parallelism strategy optimization
    - Multi-node performance modeling

    NOTE(review): the per-collective volumes below are order-of-magnitude
    heuristics (simple divisions by node count), not exact ring/tree
    collective cost models.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig,
        node_config: NodeConfig,
        engine_config: EngineConfig,
    ) -> None:
        """Initialize the multi-node calculator.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings
            node_config: Multi-node hardware configuration
            engine_config: Training engine configuration
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config
        self.node_config = node_config
        self.engine_config = engine_config

    def calculate_network_overhead(self) -> NetworkOverhead:
        """Calculate network communication overhead for multi-node training.

        Estimates communication overhead for different collective operations
        based on model size, parallelism strategy, and interconnect bandwidth.

        Returns:
            NetworkOverhead with detailed breakdown
        """
        if not self.node_config.is_multi_node:
            # Single node: defaults (presumably all-zero overhead).
            return NetworkOverhead()

        # Get model size in bytes
        model_params = self.model_config.num_parameters
        dtype_bytes = self._get_dtype_bytes()
        model_size_bytes = int(model_params * dtype_bytes)

        # Calculate communication for each collective operation
        allreduce_gb = self._calculate_allreduce_overhead(model_size_bytes)
        allgather_gb = self._calculate_allgather_overhead(model_size_bytes)
        reducescatter_gb = self._calculate_reducescatter_overhead(model_size_bytes)
        point_to_point_gb = self._calculate_pipeline_overhead(model_size_bytes)

        total_overhead_gb = allreduce_gb + allgather_gb + reducescatter_gb + point_to_point_gb

        # Estimate time overhead per step
        overhead_ms = self._estimate_communication_time_ms(total_overhead_gb)

        return NetworkOverhead(
            allreduce_gb=allreduce_gb,
            allgather_gb=allgather_gb,
            reducescatter_gb=reducescatter_gb,
            point_to_point_gb=point_to_point_gb,
            total_overhead_gb=total_overhead_gb,
            estimated_overhead_ms_per_step=overhead_ms,
        )

    def optimize_hybrid_parallelism(
        self,
        hybrid_config: HybridParallelismConfig,
    ) -> ParallelismConfig:
        """Optimize hybrid parallelism strategy for multi-node training.

        Analyzes the hardware configuration and model characteristics
        to recommend optimal parallelism degrees.

        Args:
            hybrid_config: Hybrid parallelism configuration and preferences

        Returns:
            Optimized ParallelismConfig
        """
        if not hybrid_config.auto_optimize:
            # Caller opted out of optimization: return config unchanged.
            return self.parallelism_config

        num_nodes = self.node_config.num_nodes
        gpus_per_node = self.node_config.gpus_per_node or 1
        total_gpus = num_nodes * gpus_per_node

        seq_len = self.model_config.max_seq_len

        # Determine optimal parallelism strategy
        if seq_len >= hybrid_config.sequence_parallel_threshold:
            # Enable sequence parallel for long sequences
            enable_sp = True
        else:
            # Below the threshold, defer to the caller's preference.
            enable_sp = hybrid_config.enable_sequence_parallel

        # Calculate parallelism degrees
        if hybrid_config.prefer_pipeline_parallel and num_nodes > 1:
            # Prefer pipeline parallel across nodes
            pp_size = int(min(num_nodes, 8))  # Limit pipeline stages
            tp_size = int(min(gpus_per_node, 8))  # Tensor parallel within node
            # NOTE: integer division may leave tp*pp*dp < total_gpus when
            # total_gpus is not divisible by pp_size * tp_size.
            dp_size = int(total_gpus // (pp_size * tp_size))
        else:
            # Default: maximize data parallel
            tp_size = 1
            pp_size = 1
            dp_size = int(total_gpus)

        # Ensure all values are at least 1
        tp_size = max(1, tp_size)
        pp_size = max(1, pp_size)
        dp_size = max(1, dp_size)

        return ParallelismConfig(
            tensor_parallel_size=tp_size,
            pipeline_parallel_size=pp_size,
            data_parallel_size=dp_size,
            sequence_parallel=enable_sp,
        )

    def _calculate_allreduce_overhead(self, model_size_bytes: int) -> float:
        """Calculate AllReduce communication overhead.

        AllReduce is used for gradient averaging in data parallel training.
        Algorithm: Ring AllReduce with O(2 * model_size) communication.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        # Ring AllReduce: each GPU sends/receives 2 * model_size / num_gpus
        # But we need the total across the network

        # For gradient averaging: 2 * model_size (send + receive)
        allreduce_bytes = 2 * model_size_bytes

        # Adjust for collective operation efficiency
        # In multi-node, cross-node traffic is the bottleneck
        if self.node_config.is_multi_node:
            # Only cross-node traffic matters
            # NOTE(review): dividing by num_nodes is a rough approximation of
            # the cross-node share; a ring AllReduce's exact per-link volume
            # differs — confirm against the intended cost model.
            allreduce_bytes = int(allreduce_bytes / self.node_config.num_nodes)

        # Convert bytes -> GiB.
        return allreduce_bytes / (1024**3)

    def _calculate_allgather_overhead(self, model_size_bytes: int) -> float:
        """Calculate AllGather communication overhead.

        AllGather is used in ZeRO-3 and tensor parallel for parameter gathering.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        # AllGather: (num_gpus - 1) * model_size / num_gpus per GPU
        # But for ZeRO-3, we gather all parameters
        is_zero3 = (
            self.engine_config.type == EngineType.DEEPSPEED and self.engine_config.zero_stage == 3
        )

        if is_zero3:
            # ZeRO-3 gathers all parameters during forward pass
            allgather_bytes = model_size_bytes
        else:
            # Standard allgather for tensor parallel
            allgather_bytes = int(model_size_bytes / self.parallelism_config.tensor_parallel_size)

        # Adjust for multi-node
        if self.node_config.is_multi_node:
            allgather_bytes = int(allgather_bytes / self.node_config.num_nodes)

        return allgather_bytes / (1024**3)

    def _calculate_reducescatter_overhead(self, model_size_bytes: int) -> float:
        """Calculate ReduceScatter communication overhead.

        ReduceScatter is used in ZeRO-2 and gradient sharding.

        Args:
            model_size_bytes: Model size in bytes

        Returns:
            Communication volume in GB
        """
        is_zero2 = (
            self.engine_config.type == EngineType.DEEPSPEED and self.engine_config.zero_stage == 2
        )

        if is_zero2:
            # ZeRO-2 scatters gradients
            reducescatter_bytes = model_size_bytes
        else:
            # Standard reducescatter
            reducescatter_bytes = int(model_size_bytes / self.parallelism_config.data_parallel_size)

        # Adjust for multi-node
        if self.node_config.is_multi_node:
            reducescatter_bytes = int(reducescatter_bytes / self.node_config.num_nodes)

        return reducescatter_bytes / (1024**3)

    def _calculate_pipeline_overhead(self, model_size_bytes: int) -> float:
        """Calculate pipeline parallel communication overhead.

        Point-to-point communication between pipeline stages.

        Args:
            model_size_bytes: Model size in bytes (currently unused; the
                estimate is based on activation size instead)

        Returns:
            Communication volume in GB
        """
        if self.parallelism_config.pipeline_parallel_size <= 1:
            # No pipeline stages -> no point-to-point traffic.
            return 0.0

        # Pipeline parallel sends activations between stages
        # Approximate as layer activations
        hidden_size = self.model_config.hidden_size
        seq_len = self.model_config.max_seq_len
        batch_size = self.training_config.batch_size
        num_layers = self.model_config.num_layers

        # Activation size per layer (2 bytes/element, FP16/BF16)
        activation_bytes = batch_size * seq_len * hidden_size * 2  # FP16/BF16

        # Number of microbatches determines communication frequency
        # For simplicity, assume num_stages communications per step
        # NOTE(review): multiplier is layers-per-stage (num_layers // pp_size),
        # not the number of stages the comment above suggests — confirm intent.
        pp_size = self.parallelism_config.pipeline_parallel_size
        pipeline_comm_bytes = activation_bytes * (num_layers // pp_size)

        # Adjust for multi-node
        if self.node_config.is_multi_node:
            pipeline_comm_bytes = int(pipeline_comm_bytes / self.node_config.num_nodes)

        return pipeline_comm_bytes / (1024**3)

    def _estimate_communication_time_ms(self, total_gb: float) -> float:
        """Estimate communication time per training step in milliseconds.

        Args:
            total_gb: Total communication volume in GB

        Returns:
            Estimated time in milliseconds
        """
        if total_gb == 0:
            return 0.0

        # Get bandwidth in GB/s
        # Assumes get_interconnect_bandwidth_gbps() returns gigabits/s,
        # hence the divide-by-8 to gigabytes/s — confirm against NodeConfig.
        bandwidth_gbps = self.node_config.get_interconnect_bandwidth_gbps()
        bandwidth_gbps_per_sec = bandwidth_gbps / 8  # Convert to GB/s

        # Basic time = size / bandwidth
        time_seconds = total_gb / bandwidth_gbps_per_sec

        # Add latency overhead for collective operations
        # Typical latency: 10-50 microseconds per hop
        num_nodes = self.node_config.num_nodes
        latency_overhead = num_nodes * 0.00005  # 50 microseconds per node

        # Network efficiency factor (not 100% efficient)
        efficiency = 0.85

        total_time_seconds = (time_seconds / efficiency) + latency_overhead

        return total_time_seconds * 1000  # Convert to ms

    def _get_dtype_bytes(self) -> float:
        """Get bytes per element based on dtype.

        Unknown dtypes fall back to 2 bytes (fp16/bf16).
        """
        dtype_map = {
            "fp32": 4,
            "fp16": 2,
            "bf16": 2,
            "int8": 1,
            "int4": 0.5,
        }
        return dtype_map.get(self.training_config.dtype.value, 2)

    def _calculate_model_size_gb(self) -> float:
        """Calculate model size in GB.

        NOTE(review): not called within this class — presumably used by
        external callers; verify before removing.
        """
        dtype_bytes = self._get_dtype_bytes()
        model_size_bytes = self.model_config.num_parameters * dtype_bytes
        return model_size_bytes / (1024**3)
diff --git a/src/gpu_mem_calculator/engines/__init__.py b/src/gpu_mem_calculator/engines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab2fe2071526ef3b72db402629ec275ce4b38cbb
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/__init__.py
@@ -0,0 +1,16 @@
"""Training engine implementations."""

from gpu_mem_calculator.engines.base import BaseEngine
from gpu_mem_calculator.engines.deepspeed import DeepSpeedEngine
from gpu_mem_calculator.engines.fsdp import FSDPEngine
from gpu_mem_calculator.engines.megatron import MegatronDeepSpeedEngine, MegatronLMEngine
from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine

# Public API of the engines subpackage: one engine class per supported
# training framework, plus the shared abstract base.
__all__ = [
    "BaseEngine",
    "PyTorchDDPEngine",
    "DeepSpeedEngine",
    "MegatronLMEngine",
    "MegatronDeepSpeedEngine",
    "FSDPEngine",
]
diff --git a/src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d77e30e547c6b15aa80dbb5e0dc77611ec4832b7
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb86ec0de44861e075692e6415234cd7466bc08c
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/base.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d172ff5cf1b1f65432e54aa0a46b1975561890bf
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/deepspeed.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da56619169431d8b808eb6a2dcd24573239ba03c
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/fsdp.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87716550d68026c49d48b1ffa1ca58e5c90f58ef
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/megatron.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc b/src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8444805000d819888fb32df24f13d2061a3b0f0d
Binary files /dev/null and b/src/gpu_mem_calculator/engines/__pycache__/pytorch.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/engines/base.py b/src/gpu_mem_calculator/engines/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..b12e4e7641e25e8adfe649c6172710b8363bff9d
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/base.py
@@ -0,0 +1,220 @@
+"""Base class for training engine implementations."""
+
+from abc import ABC, abstractmethod
+
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ GPUConfig,
+ MemoryBreakdown,
+ MemoryResult,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
class BaseEngine(ABC):
    """Abstract base class for training engine memory calculation.

    Each training engine (PyTorch DDP, DeepSpeed, Megatron-LM, etc.)
    should implement this interface to provide engine-specific
    memory calculations.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        training_config: TrainingConfig,
        parallelism_config: ParallelismConfig,
        engine_config: EngineConfig,
        gpu_config: GPUConfig,
        node_config: NodeConfig | None = None,
    ) -> None:
        """Initialize the engine with configuration.

        Args:
            model_config: Model architecture configuration
            training_config: Training hyperparameters
            parallelism_config: Parallelism settings
            engine_config: Engine-specific configuration
            gpu_config: Hardware configuration
            node_config: Multi-node configuration (optional; defaults to a
                single-node NodeConfig)
        """
        self.model_config = model_config
        self.training_config = training_config
        self.parallelism_config = parallelism_config
        self.engine_config = engine_config
        self.gpu_config = gpu_config
        self.node_config = node_config or NodeConfig()

    @abstractmethod
    def calculate_memory(self) -> MemoryResult:
        """Calculate memory requirements for this engine.

        This is the main method that should be implemented by each engine.

        Returns:
            MemoryResult with complete memory breakdown
        """

    def _check_feasibility(
        self,
        total_memory_per_gpu: float,
    ) -> tuple[bool, float, int | None]:
        """Check if the configuration fits on available GPU.

        Args:
            total_memory_per_gpu: Total memory required per GPU (GB)

        Returns:
            Tuple of (fits_on_gpu, utilization_percent, recommended_batch_size).
            ``utilization_percent`` may exceed 100 when the config does not
            fit; ``recommended_batch_size`` is None when it does fit.
        """
        available_memory = self.gpu_config.gpu_memory_gb
        utilization_percent = (total_memory_per_gpu / available_memory) * 100

        fits_on_gpu = total_memory_per_gpu <= available_memory

        # If doesn't fit, suggest a smaller batch size
        recommended_batch_size = None
        if not fits_on_gpu:
            # Simple heuristic: scale batch size inversely with memory excess
            # (assumes memory scales roughly linearly with batch size).
            excess_factor = total_memory_per_gpu / available_memory
            recommended_batch_size = max(1, int(self.training_config.batch_size / excess_factor))

        return fits_on_gpu, utilization_percent, recommended_batch_size

    def _create_result(
        self,
        breakdown: MemoryBreakdown,
        cpu_memory_gb: float = 0.0,
    ) -> MemoryResult:
        """Create a MemoryResult from breakdown.

        Args:
            breakdown: Memory breakdown by component
            cpu_memory_gb: CPU memory required (default 0)

        Returns:
            Complete MemoryResult, including network-overhead and multi-node
            info when ``node_config`` describes more than one node.
        """
        total_memory_per_gpu = breakdown.total_memory_gb
        total_memory_all_gpus = total_memory_per_gpu * self.gpu_config.num_gpus

        fits_on_gpu, utilization_percent, recommended_batch_size = self._check_feasibility(
            total_memory_per_gpu
        )

        # Calculate network overhead for multi-node configurations
        network_overhead = None
        multi_node_info = None
        if self.node_config.is_multi_node:
            # Local import to avoid a circular dependency between the
            # engines package and core.multinode.
            from gpu_mem_calculator.core.multinode import MultiNodeCalculator

            multinode_calc = MultiNodeCalculator(
                model_config=self.model_config,
                training_config=self.training_config,
                parallelism_config=self.parallelism_config,
                node_config=self.node_config,
                engine_config=self.engine_config,
            )
            network_overhead = multinode_calc.calculate_network_overhead()

            # Add multi-node info
            multi_node_info = {
                "num_nodes": self.node_config.num_nodes,
                "gpus_per_node": self.node_config.gpus_per_node,
                "interconnect_type": self.node_config.interconnect_type.value,
                "interconnect_bandwidth_gbps": self.node_config.get_interconnect_bandwidth_gbps(),
            }

        return MemoryResult(
            total_memory_per_gpu_gb=total_memory_per_gpu,
            total_memory_all_gpus_gb=total_memory_all_gpus,
            cpu_memory_gb=cpu_memory_gb,
            breakdown=breakdown,
            network_overhead=network_overhead,
            fits_on_gpu=fits_on_gpu,
            memory_utilization_percent=utilization_percent,
            recommended_batch_size=recommended_batch_size,
            multi_node_info=multi_node_info,
        )

    @property
    def effective_batch_size(self) -> int:
        """Calculate effective batch size with gradient accumulation."""
        return (
            self.training_config.batch_size
            * self.training_config.gradient_accumulation_steps
            * self.parallelism_config.data_parallel_size
        )

    @property
    def total_num_gpus(self) -> int:
        """Get total number of GPUs."""
        return self.gpu_config.num_gpus

    @property
    def num_gpus_per_model(self) -> int:
        """Get number of GPUs per model replica.

        This is tensor_parallel * pipeline_parallel for distributed training.
        """
        return (
            self.parallelism_config.tensor_parallel_size
            * self.parallelism_config.pipeline_parallel_size
        )

    def calculate_moe_activation_multiplier(self) -> float:
        """Calculate activation memory multiplier for MoE models.

        For MoE models, activation memory depends on top_k (active experts per
        token) rather than total number of experts, because only top_k experts
        are activated per token during forward/backward pass.

        Returns:
            Multiplier for activation memory (1.0 for dense models, <1 for MoE)
        """
        if not self.model_config.moe_enabled:
            return 1.0

        num_experts = self.model_config.num_experts
        top_k = self.model_config.top_k

        # Defensive guard: an MoE config with missing or zero expert counts
        # would otherwise raise (None arithmetic / division by zero).
        # Treat such a config as dense rather than crashing.
        if not num_experts or not top_k:
            return 1.0

        # Base activation ratio: only top_k experts active per token.
        activation_ratio = top_k / num_experts

        # Add router overhead (typically 5-15% extra for gating)
        router_overhead = 0.1

        # For models with shared experts (like GLM), adjust accordingly
        if self.model_config.shared_expert_intermediate_size:
            # Shared expert is always active, so add its contribution
            # This is a simplified approximation
            activation_ratio = activation_ratio + (1.0 / num_experts)

        # Never exceed the dense-model multiplier.
        return min(1.0, activation_ratio + router_overhead)

    def calculate_moe_parameter_ratio(self) -> float:
        """Calculate effective parameter ratio for MoE models.

        For MoE models, only top_k experts are used during forward pass,
        but all expert parameters are stored in memory.

        Returns:
            Ratio of active parameters to total parameters (for memory
            estimation). Always 1.0 today: all expert parameters (and their
            gradients) must be resident, so storage is not reduced by routing.
            Kept as a hook for inference-specific calculations.
        """
        if not self.model_config.moe_enabled:
            return 1.0

        return 1.0  # All parameters stored in memory
diff --git a/src/gpu_mem_calculator/engines/deepspeed.py b/src/gpu_mem_calculator/engines/deepspeed.py
new file mode 100644
index 0000000000000000000000000000000000000000..40399064bc2aa86f636047593517e333eae48369
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/deepspeed.py
@@ -0,0 +1,316 @@
+"""DeepSpeed ZeRO engine implementation.
+
+Implements memory calculations for DeepSpeed ZeRO stages 1, 2, and 3.
+Based on: https://deepspeed.readthedocs.io/en/latest/memory.html
+"""
+
+from gpu_mem_calculator.core.formulas import (
+ calculate_activation_memory,
+ calculate_overhead,
+ estimate_largest_layer_params,
+)
+from gpu_mem_calculator.core.models import (
+ MemoryBreakdown,
+ MemoryResult,
+ OffloadDevice,
+)
+from gpu_mem_calculator.engines.base import BaseEngine
+from gpu_mem_calculator.utils.precision import gb_from_bytes
+
+
+class DeepSpeedEngine(BaseEngine):
+ """DeepSpeed ZeRO memory calculation.
+
+ Implements ZeRO stages:
+ - ZeRO-1: Shard optimizer states
+ - ZeRO-2: Shard optimizer states + gradients
+ - ZeRO-3: Shard optimizer states + gradients + parameters
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory requirements for DeepSpeed ZeRO training.
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ zero_stage = self.engine_config.zero_stage or 0
+ offload_optimizer = self.engine_config.offload_optimizer
+ offload_param = self.engine_config.offload_param
+
+ # Get largest layer params for ZeRO-3
+ if self.model_config.largest_layer_params is None:
+ largest_layer_params = estimate_largest_layer_params(
+ hidden_size=self.model_config.hidden_size,
+ num_attention_heads=self.model_config.num_attention_heads,
+ )
+ else:
+ largest_layer_params = self.model_config.largest_layer_params
+
+ match zero_stage:
+ case 0:
+ return self._calculate_zero0()
+ case 1:
+ return self._calculate_zero1(offload_optimizer)
+ case 2:
+ return self._calculate_zero2(offload_optimizer)
+ case 3:
+ return self._calculate_zero3(
+ offload_optimizer,
+ offload_param,
+ largest_layer_params,
+ )
+ case _:
+ # Default to ZeRO-2
+ return self._calculate_zero2(offload_optimizer)
+
+ def _calculate_zero0(self) -> MemoryResult:
+ """Calculate memory for ZeRO-0 (disabled, same as PyTorch DDP)."""
+ # Import here to avoid circular dependency
+ from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine
+
+ # ZeRO-0 is the same as PyTorch DDP
+ ddp_engine = PyTorchDDPEngine(
+ model_config=self.model_config,
+ training_config=self.training_config,
+ parallelism_config=self.parallelism_config,
+ engine_config=self.engine_config,
+ gpu_config=self.gpu_config,
+ )
+ return ddp_engine.calculate_memory()
+
    def _calculate_zero1(
        self,
        offload_optimizer: OffloadDevice,
    ) -> MemoryResult:
        """Calculate memory for ZeRO-1 (shard optimizer states).

        ZeRO-1 shards optimizer states across data parallel GPUs.

        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

        Memory formula:
        - offload_optimizer=cpu: 2 * params (fp16 params only on GPU)
        - offload_optimizer=none: 4 * params (fp16 params + fp32 params) +
          12 * params / num_gpus (sharded optimizer states)

        NOTE(review): the implementation below keeps full fp16 gradients on
        GPU in both cases (so the CPU-offload GPU footprint is ~4 * params,
        not the 2 * params quoted above) — confirm which is intended.

        Note: Optimizer states = 12 bytes per param for Adam/AdamW
        - 4 bytes: FP32 parameter copy
        - 4 bytes: Momentum (FP32)
        - 4 bytes: Variance (FP32)
        """
        num_params = self.model_config.num_parameters
        num_gpus = self.total_num_gpus

        # Model parameters (fp16/bf16 on GPU) — NOT sharded in ZeRO-1
        model_params_gb = gb_from_bytes(num_params * 2)  # FP16/BF16 = 2 bytes

        # Gradients (fp16 on GPU) — NOT sharded in ZeRO-1 (full copy per GPU)
        gradients_gb = gb_from_bytes(num_params * 2)

        # Optimizer states (sharded across GPUs, possibly offloaded to CPU)
        # 12 bytes per param for Adam/AdamW (FP32 params copy + momentum + variance)
        if offload_optimizer == OffloadDevice.CPU:
            # Offloaded to CPU, minimal GPU memory for optimizer
            optimizer_gb = 0.0
            cpu_memory_gb = gb_from_bytes(num_params * 12)  # Full optimizer on CPU
        else:
            # Sharded across GPUs: 12 bytes / num_gpus per GPU
            optimizer_gb = gb_from_bytes((num_params * 12) / num_gpus)
            cpu_memory_gb = 0.0

        # Activations (same as baseline — ZeRO sharding does not reduce them)
        activations_gb = calculate_activation_memory(
            batch_size=self.training_config.batch_size,
            seq_len=self.model_config.max_seq_len,
            hidden_size=self.model_config.hidden_size,
            num_layers=self.model_config.num_layers,
            num_attention_heads=self.model_config.num_attention_heads,
            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
            activation_checkpointing=self.training_config.activation_checkpointing,
            moe_enabled=self.model_config.moe_enabled,
            num_experts=self.model_config.num_experts,
            top_k=self.model_config.top_k,
            expert_intermediate_size=self.model_config.expert_intermediate_size,
        )

        # Overhead (fragmentation, buffers, etc.) proportional to the subtotal
        base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
        overhead_gb = calculate_overhead(base_memory)

        breakdown = MemoryBreakdown(
            model_params_gb=model_params_gb,
            gradients_gb=gradients_gb,
            optimizer_states_gb=optimizer_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown, cpu_memory_gb)
+
+ def _calculate_zero2(
+ self,
+ offload_optimizer: OffloadDevice,
+ ) -> MemoryResult:
+ """Calculate memory for ZeRO-2 (shard optimizer + gradients).
+
+ ZeRO-2 shards optimizer states AND gradients across data parallel GPUs.
+
+ Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
+ Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
+
+ Memory formula:
+ - offload_optimizer=cpu: 2 * params (fp16 params) +
+ (2 * params / num_gpus) (sharded fp16 grads)
+ - offload_optimizer=none: 2 * params (fp16 params) +
+ 2 * params / num_gpus (sharded fp16 grads) +
+ 12 * params / num_gpus (sharded optimizer states)
+
+ Note: Unlike ZeRO-1, ZeRO-2 shards gradients across GPUs
+ """
+ num_params = self.model_config.num_parameters
+ num_gpus = self.total_num_gpus
+
+ # Model parameters (fp16/bf16 on GPU) - NOT sharded in ZeRO-2
+ model_params_gb = gb_from_bytes(num_params * 2) # FP16/BF16 = 2 bytes
+
+ # Gradients (fp16 on GPU) - SHARDED in ZeRO-2
+ gradients_gb = gb_from_bytes((num_params * 2) / num_gpus)
+
+ # Optimizer states (sharded across GPUs, possibly offloaded to CPU)
+ # 12 bytes per param for Adam/AdamW (FP32 params copy + momentum + variance)
+ if offload_optimizer == OffloadDevice.CPU:
+ # Offloaded to CPU, minimal GPU memory for optimizer
+ optimizer_gb = 0.0
+ cpu_memory_gb = gb_from_bytes(num_params * 12) # Full optimizer on CPU
+ else:
+ # Sharded across GPUs: 12 bytes / num_gpus per GPU
+ optimizer_gb = gb_from_bytes((num_params * 12) / num_gpus)
+ cpu_memory_gb = 0.0
+
+ # Activations (same as baseline)
+ activations_gb = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = calculate_overhead(base_memory)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown, cpu_memory_gb)
+
+    def _calculate_zero3(
+        self,
+        offload_optimizer: OffloadDevice,
+        offload_param: OffloadDevice,
+        largest_layer_params: int,
+    ) -> MemoryResult:
+        """Calculate memory for ZeRO-3 (shard params + optimizer + gradients).
+
+        ZeRO-3 shards everything across GPUs.
+
+        Reference: https://deepspeed.readthedocs.io/en/latest/memory.html
+        Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
+
+        Memory formula (DeepSpeed's published estimates):
+        - largest_layer_memory = 4 * largest_layer_params (fp16 params + fp16 grads)
+
+        Case 1 (no offload):
+            largest_layer_memory + 18 * params / num_gpus
+            (where 18 = 16 bytes optimizer states + 2 bytes fp16 params)
+
+        Case 2 (param + optimizer offload to CPU):
+            largest_layer_memory (main limit is CPU RAM)
+
+        Case 3 (optimizer offload to CPU only):
+            largest_layer_memory + 2 * params / num_gpus
+
+        Note: Optimizer states = 16 bytes per param for Adam/AdamW (FP32)
+        - 4 bytes: FP32 parameter copy
+        - 4 bytes: Momentum (FP32)
+        - 4 bytes: Variance (FP32)
+        - 4 bytes: Gradient (FP32 copy for optimizer update)
+
+        NOTE(review): the implementation below additionally counts a sharded
+        fp16 gradient term (2 * params / num_gpus) in Cases 1 and 3, so it
+        sums to 20 * params / num_gpus (Case 1) and 4 * params / num_gpus
+        (Case 3) — deliberately more conservative than the formulas above,
+        or a discrepancy to reconcile with the DeepSpeed estimator; confirm
+        the intended behavior before changing either.
+
+        Args:
+            offload_optimizer: Where optimizer states live (GPU or CPU).
+            offload_param: Where parameters live (GPU or CPU).
+            largest_layer_params: Parameter count of the largest single
+                layer (the unit that must be fully gathered on one GPU).
+
+        Returns:
+            MemoryResult with the per-GPU breakdown and any CPU-side memory.
+        """
+        num_params = self.model_config.num_parameters
+        num_gpus = self.total_num_gpus
+
+        # Largest layer memory (fp16 params + fp16 grads gathered on one GPU):
+        # 2 bytes params + 2 bytes grads = 4 bytes per largest-layer param.
+        largest_layer_memory_gb = gb_from_bytes(largest_layer_params * 4)
+
+        # Calculate memory based on offload configuration
+        if offload_param == OffloadDevice.CPU and offload_optimizer == OffloadDevice.CPU:
+            # Case 2: Both params and optimizer offloaded to CPU.
+            # Only the currently-gathered layer occupies the GPU; the full
+            # 18 bytes/param (fp16 params + FP32 optimizer set) sit in host RAM.
+            params_per_gpu_gb = 0.0
+            gradients_per_gpu_gb = 0.0
+            optimizer_gb = 0.0
+            cpu_memory_gb = gb_from_bytes(num_params * 18)  # Full model on CPU
+        elif offload_optimizer == OffloadDevice.CPU:
+            # Case 3: Only optimizer offloaded to CPU.
+            # GPU keeps sharded fp16 params and (see NOTE above) sharded
+            # fp16 grads; the 16-byte FP32 optimizer set lives in host RAM.
+            params_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+            gradients_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+            optimizer_gb = 0.0
+            cpu_memory_gb = gb_from_bytes(num_params * 16)  # Optimizer on CPU
+        else:
+            # Case 1: No offload — everything sharded across the GPUs.
+            params_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+            gradients_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+            optimizer_gb = gb_from_bytes((num_params * 16) / num_gpus)  # FP32
+            cpu_memory_gb = 0.0
+
+        # Model params = largest layer for ZeRO-3 (the transient gather buffer).
+        model_params_gb = largest_layer_memory_gb
+
+        # Activations (unaffected by ZeRO sharding).
+        activations_gb = calculate_activation_memory(
+            batch_size=self.training_config.batch_size,
+            seq_len=self.model_config.max_seq_len,
+            hidden_size=self.model_config.hidden_size,
+            num_layers=self.model_config.num_layers,
+            num_attention_heads=self.model_config.num_attention_heads,
+            tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+            activation_checkpointing=self.training_config.activation_checkpointing,
+            moe_enabled=self.model_config.moe_enabled,
+            num_experts=self.model_config.num_experts,
+            top_k=self.model_config.top_k,
+            expert_intermediate_size=self.model_config.expert_intermediate_size,
+        )
+
+        # Fixed-fraction overhead on top of the tracked components.
+        base_memory = (
+            model_params_gb
+            + params_per_gpu_gb
+            + gradients_per_gpu_gb
+            + optimizer_gb
+            + activations_gb
+        )
+        overhead_gb = calculate_overhead(base_memory)
+
+        # For ZeRO-3, we combine params/gradients/optimizer into model_params in breakdown
+        breakdown = MemoryBreakdown(
+            model_params_gb=model_params_gb + params_per_gpu_gb,
+            gradients_gb=gradients_per_gpu_gb,
+            optimizer_states_gb=optimizer_gb,
+            activations_gb=activations_gb,
+            overhead_gb=overhead_gb,
+        )
+
+        return self._create_result(breakdown, cpu_memory_gb)
diff --git a/src/gpu_mem_calculator/engines/fsdp.py b/src/gpu_mem_calculator/engines/fsdp.py
new file mode 100644
index 0000000000000000000000000000000000000000..00a9a71a0b819b2f05fe1cfa593a4e88e59c830c
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/fsdp.py
@@ -0,0 +1,213 @@
+"""FSDP (Fully Sharded Data Parallel) engine implementation.
+
+Implements memory calculations for PyTorch FSDP.
+
+Reference: https://pytorch.org/docs/stable/fsdp.html
+Reference: https://blog.eleuther.ai/transformer-math/
+"""
+
+from gpu_mem_calculator.core.formulas import (
+ calculate_activation_memory,
+ calculate_overhead,
+ estimate_largest_layer_params,
+)
+from gpu_mem_calculator.core.models import (
+ MemoryBreakdown,
+ MemoryResult,
+)
+from gpu_mem_calculator.engines.base import BaseEngine
+from gpu_mem_calculator.utils.precision import gb_from_bytes
+
+
+class FSDPEngine(BaseEngine):
+ """PyTorch FSDP memory calculation.
+
+ FSDP shards model parameters, gradients, and optimizer states
+ across data parallel GPUs, similar to DeepSpeed ZeRO-3.
+
+ Sharding strategies:
+ - NO_SHARD: Equivalent to DDP (no sharding)
+ - SHARD_GRAD_OP: Shard gradients and optimizer states (like ZeRO-2)
+ - FULL_SHARD: Shard everything (like ZeRO-3)
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory requirements for FSDP training.
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ sharding_strategy = self.engine_config.sharding_strategy
+
+ # Get largest layer params for FULL_SHARD
+ if self.model_config.largest_layer_params is None:
+ largest_layer_params = estimate_largest_layer_params(
+ hidden_size=self.model_config.hidden_size,
+ num_attention_heads=self.model_config.num_attention_heads,
+ )
+ else:
+ largest_layer_params = self.model_config.largest_layer_params
+
+ match sharding_strategy:
+ case "no_shard":
+ return self._calculate_no_shard()
+ case "shard_grad_op":
+ return self._calculate_shard_grad_op()
+ case "full_shard":
+ return self._calculate_full_shard(largest_layer_params)
+ case _:
+ # Default to full shard
+ return self._calculate_full_shard(largest_layer_params)
+
+ def _calculate_no_shard(self) -> MemoryResult:
+ """Calculate memory for NO_SHARD (same as DDP).
+
+ No sharding - each GPU holds a full copy of the model.
+ """
+ # Import PyTorch DDP engine
+ from gpu_mem_calculator.engines.pytorch import PyTorchDDPEngine
+
+ ddp_engine = PyTorchDDPEngine(
+ model_config=self.model_config,
+ training_config=self.training_config,
+ parallelism_config=self.parallelism_config,
+ engine_config=self.engine_config,
+ gpu_config=self.gpu_config,
+ )
+ return ddp_engine.calculate_memory()
+
+ def _calculate_shard_grad_op(self) -> MemoryResult:
+ """Calculate memory for SHARD_GRAD_OP.
+
+ Shards gradients and optimizer states across GPUs.
+ Similar to DeepSpeed ZeRO-2.
+
+ Reference: https://pytorch.org/tutorials/intermediate/FSDP_advanced.html
+ Reference: https://blog.eleuther.ai/transformer-math/
+
+ Memory formula:
+ - Model parameters: Full model on each GPU (not sharded)
+ - Gradients: Sharded across GPUs
+ - Optimizer states: Sharded across GPUs (12 bytes per param for Adam/AdamW)
+
+ Note: Optimizer states = 12 bytes per param for Adam/AdamW
+ - 4 bytes: FP32 parameter copy
+ - 4 bytes: Momentum (FP32)
+ - 4 bytes: Variance (FP32)
+ """
+ num_params = self.model_config.num_parameters
+ num_gpus = self.total_num_gpus
+
+ # Model parameters (full model on each GPU)
+ model_params_gb = gb_from_bytes(num_params * 2) # FP16/BF16
+
+ # Gradients (sharded)
+ gradients_gb = gb_from_bytes((num_params * 2) / num_gpus)
+
+ # Optimizer states (sharded) - 12 bytes per param for Adam/AdamW
+ optimizer_gb = gb_from_bytes((num_params * 12) / num_gpus) # FP32
+
+ # Activations
+ activations_gb = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = calculate_overhead(base_memory)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
+
+ def _calculate_full_shard(self, largest_layer_params: int) -> MemoryResult:
+ """Calculate memory for FULL_SHARD.
+
+ Shards parameters, gradients, and optimizer states.
+ Similar to DeepSpeed ZeRO-3.
+
+ Reference: https://pytorch.org/tutorials/intermediate/FSDP_advanced.html
+ Reference: https://blog.eleuther.ai/transformer-math/
+
+ Memory formula:
+ - Largest layer: 4 * largest_layer_params (fp16 params + fp16 grads)
+ - Remaining parameters and gradients: Sharded across GPUs (2 bytes fp16 each)
+ - Optimizer states: Sharded across GPUs (12 bytes per param for Adam/AdamW in FP32)
+
+ Total per GPU: largest_layer_memory + 2 * params / num_gpus +
+ 2 * params / num_gpus + 12 * params / num_gpus
+ = largest_layer_memory + 16 * params / num_gpus
+
+ Note: FSDP typically uses 12 bytes for optimizer states (not 16 like DeepSpeed ZeRO-3)
+ because FSDP doesn't keep an additional FP32 gradient copy in the optimizer states.
+ """
+ num_params = self.model_config.num_parameters
+ num_gpus = self.total_num_gpus
+
+ # Largest layer memory (fp16 params + fp16 grads gathered during compute)
+ largest_layer_memory_gb = gb_from_bytes(largest_layer_params * 4)
+
+ # Sharded parameters (fp16)
+ params_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+
+ # Sharded gradients (fp16)
+ gradients_per_gpu_gb = gb_from_bytes((num_params * 2) / num_gpus)
+
+ # Sharded optimizer states (FP32 for Adam/AdamW)
+ # 12 bytes per param: 4 bytes fp32 params copy + 4 bytes momentum + 4 bytes variance
+ optimizer_per_gpu_gb = gb_from_bytes((num_params * 12) / num_gpus)
+
+ # Model params in breakdown: largest layer (gathered) + sharded params
+ # This represents the total parameter memory on each GPU
+ model_params_gb = largest_layer_memory_gb + params_per_gpu_gb
+
+ # Activations
+ activations_gb = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Overhead
+ base_memory = (
+ largest_layer_memory_gb
+ + params_per_gpu_gb
+ + gradients_per_gpu_gb
+ + optimizer_per_gpu_gb
+ + activations_gb
+ )
+ overhead_gb = calculate_overhead(base_memory)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_per_gpu_gb,
+ optimizer_states_gb=optimizer_per_gpu_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
diff --git a/src/gpu_mem_calculator/engines/megatron.py b/src/gpu_mem_calculator/engines/megatron.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cacfdaca3b906b1058c25f3c61626da8bc079df
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/megatron.py
@@ -0,0 +1,257 @@
+"""Megatron-LM engine implementation.
+
+Implements memory calculations for Megatron-LM with tensor, pipeline,
+and sequence parallelism.
+
+Reference: https://github.com/NVIDIA/Megatron-LM
+Reference: https://arxiv.org/abs/1909.08053
+Reference: https://blog.eleuther.ai/transformer-math/
+"""
+
+from gpu_mem_calculator.core.formulas import (
+ calculate_activation_memory,
+ calculate_gradient_memory,
+ calculate_optimizer_memory,
+ calculate_overhead,
+ calculate_parameter_memory,
+)
+from gpu_mem_calculator.core.models import (
+ MemoryBreakdown,
+ MemoryResult,
+)
+from gpu_mem_calculator.engines.base import BaseEngine
+from gpu_mem_calculator.utils.precision import gb_from_bytes
+
+
+class MegatronLMEngine(BaseEngine):
+ """Megatron-LM memory calculation.
+
+ Megatron-LM uses tensor parallelism to split individual layers across GPUs,
+ and optionally pipeline parallelism to split layers across GPUs.
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory requirements for Megatron-LM training.
+
+ Megatron-LM memory characteristics:
+ - Parameters are sharded across tensor parallel GPUs
+ - Gradients are sharded across tensor parallel GPUs
+ - Optimizer states can be sharded or replicated
+ - Activations depend on tensor/pipeline/sequence parallelism
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ tp_size = self.parallelism_config.tensor_parallel_size
+ pp_size = self.parallelism_config.pipeline_parallel_size
+ seq_parallel = self.parallelism_config.sequence_parallel
+
+ # 1. Model parameters (sharded by tensor parallelism)
+ # Each TP GPU holds 1/tp of the parameters
+ params_per_gpu = self.model_config.num_parameters / tp_size
+ model_params_gb = calculate_parameter_memory(
+ num_params=int(params_per_gpu),
+ dtype=self.training_config.dtype.value,
+ )
+
+ # 2. Gradients (sharded by tensor parallelism)
+ gradients_gb = calculate_gradient_memory(
+ num_params=int(params_per_gpu),
+ dtype=self.training_config.dtype.value,
+ )
+
+ # 3. Optimizer states
+ # In Megatron-LM, optimizer states are typically sharded similarly to parameters
+ # for tensor parallelism, but this can vary based on configuration
+ optimizer_gb = calculate_optimizer_memory(
+ num_params=int(params_per_gpu),
+ optimizer=self.training_config.optimizer.value,
+ )
+
+ # 4. Activations
+ # Activations are affected by:
+ # - Tensor parallelism: splits activations across TP GPUs
+ # - Pipeline parallelism: only holds activations for current stage
+ # - Sequence parallelism: splits sequence dimension
+ activations_gb = self._calculate_megatron_activations(
+ tp_size=tp_size,
+ pp_size=pp_size,
+ seq_parallel=seq_parallel,
+ )
+
+ # 5. Overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = calculate_overhead(base_memory)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
+
+ def _calculate_megatron_activations(
+ self,
+ tp_size: int,
+ pp_size: int,
+ seq_parallel: bool,
+ ) -> float:
+ """Calculate activation memory for Megatron-LM.
+
+ Megatron-LM activations are affected by parallelism strategy:
+ - Tensor parallelism: splits hidden dimension
+ - Pipeline parallelism: only current stage's activations
+ - Sequence parallelism: splits sequence dimension
+
+ Args:
+ tp_size: Tensor parallelism size
+ pp_size: Pipeline parallelism size
+ seq_parallel: Whether sequence parallelism is enabled
+
+ Returns:
+ Activation memory in GB
+ """
+
+ # Base activation memory
+ base_activations = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=tp_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Adjust for pipeline parallelism
+ # Each PP stage only holds num_layers / pp_size layers
+ pp_factor = 1.0 / pp_size
+
+ # Adjust for sequence parallelism
+ # If enabled, splits sequence dimension across TP GPUs
+ if seq_parallel and tp_size > 1:
+ seq_factor = 1.0 / tp_size
+ else:
+ seq_factor = 1.0
+
+ return base_activations * pp_factor * seq_factor
+
+
+class MegatronDeepSpeedEngine(BaseEngine):
+ """Megatron-LM + DeepSpeed combined engine.
+
+ This combines Megatron-LM's tensor/pipeline parallelism with
+ DeepSpeed ZeRO's optimizer/gradient sharding.
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory for Megatron-LM + DeepSpeed.
+
+ This uses:
+ - Megatron-LM for tensor/pipeline parallelism and activation memory
+ - DeepSpeed ZeRO for optimizer/gradient sharding
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ # Import DeepSpeed engine
+
+ # First calculate activation memory using Megatron-LM approach
+ tp_size = self.parallelism_config.tensor_parallel_size
+ pp_size = self.parallelism_config.pipeline_parallel_size
+ seq_parallel = self.parallelism_config.sequence_parallel
+
+ activations_gb = self._calculate_megatron_activations(
+ tp_size=tp_size,
+ pp_size=pp_size,
+ seq_parallel=seq_parallel,
+ )
+
+ # For parameters, gradients, optimizer - use DeepSpeed ZeRO logic
+ # But account for tensor parallelism (parameters are already split by TP)
+ tp_size = self.parallelism_config.tensor_parallel_size
+ params_per_gpu = self.model_config.num_parameters / tp_size
+
+ zero_stage = self.engine_config.zero_stage or 2
+ offload_optimizer = self.engine_config.offload_optimizer
+
+ # Model parameters (sharded by TP, then possibly by ZeRO)
+ if zero_stage >= 3:
+ # ZeRO-3 shards further
+ dp_size = self.parallelism_config.data_parallel_size
+ model_params_gb = gb_from_bytes((params_per_gpu * 2) / dp_size)
+ else:
+ # ZeRO-0/1/2 keeps parameters on each TP GPU
+ model_params_gb = gb_from_bytes(params_per_gpu * 2)
+
+ # Gradients
+ if zero_stage >= 2:
+ dp_size = self.parallelism_config.data_parallel_size
+ gradients_gb = gb_from_bytes((params_per_gpu * 2) / dp_size)
+ else:
+ gradients_gb = gb_from_bytes(params_per_gpu * 2)
+
+ # Optimizer states (12 bytes per param for Adam/AdamW in FP32)
+ if offload_optimizer.value == "cpu":
+ optimizer_gb = 0.0
+ else:
+ if zero_stage >= 1:
+ dp_size = self.parallelism_config.data_parallel_size
+ optimizer_gb = gb_from_bytes((params_per_gpu * 12) / dp_size)
+ else:
+ optimizer_gb = gb_from_bytes(params_per_gpu * 12)
+
+ # Overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = gb_from_bytes(base_memory * 0.2)
+
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
+
+ def _calculate_megatron_activations(
+ self,
+ tp_size: int,
+ pp_size: int,
+ seq_parallel: bool,
+ ) -> float:
+ """Calculate activation memory for Megatron-LM."""
+
+ # Base activation memory
+ base_activations = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=tp_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # Adjust for pipeline parallelism
+ pp_factor = 1.0 / pp_size
+
+ # Adjust for sequence parallelism
+ if seq_parallel and tp_size > 1:
+ seq_factor = 1.0 / tp_size
+ else:
+ seq_factor = 1.0
+
+ return base_activations * pp_factor * seq_factor
diff --git a/src/gpu_mem_calculator/engines/pytorch.py b/src/gpu_mem_calculator/engines/pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed3d8ae9df75c7870d7929617214e67cfc8a3dd7
--- /dev/null
+++ b/src/gpu_mem_calculator/engines/pytorch.py
@@ -0,0 +1,88 @@
+"""PyTorch DDP (Distributed Data Parallel) engine implementation.
+
+This is the baseline implementation without any memory optimizations.
+
+Reference: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
+Reference: https://blog.eleuther.ai/transformer-math/
+"""
+
+from gpu_mem_calculator.core.formulas import (
+ calculate_activation_memory,
+ calculate_gradient_memory,
+ calculate_optimizer_memory,
+ calculate_overhead,
+ calculate_parameter_memory,
+)
+from gpu_mem_calculator.core.models import (
+ MemoryBreakdown,
+ MemoryResult,
+)
+from gpu_mem_calculator.engines.base import BaseEngine
+
+
+class PyTorchDDPEngine(BaseEngine):
+ """PyTorch DDP memory calculation.
+
+ DDP replicates the model on each GPU, so memory is not sharded.
+ Each GPU holds a full copy of the model, gradients, and optimizer states.
+ """
+
+ def calculate_memory(self) -> MemoryResult:
+ """Calculate memory requirements for PyTorch DDP training.
+
+ For DDP:
+ - Model parameters: Full model on each GPU
+ - Gradients: Full gradients on each GPU
+ - Optimizer states: Full optimizer states on each GPU (FP32)
+ - Activations: Batch size dependent, split by data parallel
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ # 1. Model parameters (in the specified dtype)
+ model_params_gb = calculate_parameter_memory(
+ num_params=self.model_config.num_parameters,
+ dtype=self.training_config.dtype.value,
+ )
+
+ # 2. Gradients (same precision as parameters for mixed precision)
+ gradients_gb = calculate_gradient_memory(
+ num_params=self.model_config.num_parameters,
+ dtype=self.training_config.dtype.value,
+ )
+
+ # 3. Optimizer states (always FP32 for Adam/AdamW)
+ optimizer_gb = calculate_optimizer_memory(
+ num_params=self.model_config.num_parameters,
+ optimizer=self.training_config.optimizer.value,
+ )
+
+ # 4. Activations (depends on batch size and model architecture)
+ activations_gb = calculate_activation_memory(
+ batch_size=self.training_config.batch_size,
+ seq_len=self.model_config.max_seq_len,
+ hidden_size=self.model_config.hidden_size,
+ num_layers=self.model_config.num_layers,
+ num_attention_heads=self.model_config.num_attention_heads,
+ tensor_parallel_size=self.parallelism_config.tensor_parallel_size,
+ activation_checkpointing=self.training_config.activation_checkpointing,
+ moe_enabled=self.model_config.moe_enabled,
+ num_experts=self.model_config.num_experts,
+ top_k=self.model_config.top_k,
+ expert_intermediate_size=self.model_config.expert_intermediate_size,
+ )
+
+ # 5. Calculate overhead
+ base_memory = model_params_gb + gradients_gb + optimizer_gb + activations_gb
+ overhead_gb = calculate_overhead(base_memory)
+
+ # Create breakdown
+ breakdown = MemoryBreakdown(
+ model_params_gb=model_params_gb,
+ gradients_gb=gradients_gb,
+ optimizer_states_gb=optimizer_gb,
+ activations_gb=activations_gb,
+ overhead_gb=overhead_gb,
+ )
+
+ return self._create_result(breakdown)
diff --git a/src/gpu_mem_calculator/exporters/__init__.py b/src/gpu_mem_calculator/exporters/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebd4570a402ff9017cf98c8058cb6324f5426ebe
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/__init__.py
@@ -0,0 +1,14 @@
+"""Framework configuration exporters."""
+
+from gpu_mem_calculator.exporters.accelerate import AccelerateExporter
+from gpu_mem_calculator.exporters.axolotl import AxolotlExporter
+from gpu_mem_calculator.exporters.lightning import LightningExporter
+from gpu_mem_calculator.exporters.manager import ExportFormat, ExportManager
+
+__all__ = [
+ "ExportManager",
+ "ExportFormat",
+ "AccelerateExporter",
+ "LightningExporter",
+ "AxolotlExporter",
+]
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49d72f4e608a33580e83f6de232bb5a3065ecbe2
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15b15a1825a045c015b20a455999e8fbcb48dbf3
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/accelerate.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0a3eb673d5b78a81d610973234ac6fb947e1777
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/axolotl.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a292093753ce8efa721a5353db1b5441bfba33df
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/lightning.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/__pycache__/manager.cpython-312.pyc b/src/gpu_mem_calculator/exporters/__pycache__/manager.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1326c58a49468421be1f2e7427848ec4900db182
Binary files /dev/null and b/src/gpu_mem_calculator/exporters/__pycache__/manager.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/exporters/accelerate.py b/src/gpu_mem_calculator/exporters/accelerate.py
new file mode 100644
index 0000000000000000000000000000000000000000..a60dadf9da95137d194ed3933ce703188b765b66
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/accelerate.py
@@ -0,0 +1,187 @@
+"""HuggingFace Accelerate configuration exporter.
+
+Generates configuration files for HuggingFace Accelerate distributed training.
+"""
+
+from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
+class AccelerateExporter:
+ """Export configuration to HuggingFace Accelerate format.
+
+ Accelerate uses a YAML configuration file to configure distributed
+ training strategies including FSDP, DeepSpeed, and multi-GPU setups.
+ """
+
+ def __init__(
+ self,
+ model_config: ModelConfig,
+ training_config: TrainingConfig,
+ parallelism_config: ParallelismConfig,
+ engine_config: EngineConfig,
+ node_config: NodeConfig | None = None,
+ ) -> None:
+ """Initialize the Accelerate exporter.
+
+ Args:
+ model_config: Model architecture configuration
+ training_config: Training hyperparameters
+ parallelism_config: Parallelism settings
+ engine_config: Training engine configuration
+ node_config: Multi-node configuration (optional)
+ """
+ self.model_config = model_config
+ self.training_config = training_config
+ self.parallelism_config = parallelism_config
+ self.engine_config = engine_config
+ self.node_config = node_config
+
+ def export(self) -> dict:
+ """Export configuration to Accelerate format.
+
+ Returns:
+ Dictionary compatible with Accelerate config file format
+ """
+ config: dict = {
+ "compute_environment": (
+ "LOCAL_MACHINE"
+ if not self.node_config or self.node_config.num_nodes == 1
+ else "MULTI_GPU"
+ ),
+ "distributed_type": self._get_distributed_type(),
+ "mixed_precision": self._get_mixed_precision(),
+ "downcast_bf16": self._get_downcast_bf16(),
+ }
+
+ # Add multi-GPU configuration
+ if self.node_config and self.node_config.num_nodes > 1:
+ config["num_machines"] = self.node_config.num_nodes
+ config["num_processes"] = self.node_config.gpus_per_node or 1
+ config["main_process_port"] = 29500
+ config["main_training_function"] = "main"
+
+ # Add FSDP configuration if using FSDP
+ if self.engine_config.type == EngineType.FSDP:
+ config["fsdp_config"] = self._get_fsdp_config()
+
+ # Add DeepSpeed configuration if using DeepSpeed
+ if self.engine_config.type == EngineType.DEEPSPEED:
+ config["deepspeed_config"] = self._get_deepspeed_config()
+
+ return config
+
+ def _get_distributed_type(self) -> str:
+ """Get Accelerate distributed type."""
+ if self.engine_config.type == EngineType.FSDP:
+ return "FSDP"
+ elif self.engine_config.type == EngineType.DEEPSPEED:
+ return "DEEPSPEED"
+ elif self.parallelism_config.tensor_parallel_size > 1:
+ return "MEGATRON_LM"
+ elif self.parallelism_config.data_parallel_size > 1:
+ return "MULTI_GPU"
+ else:
+ return "NO"
+
+ def _get_mixed_precision(self) -> str:
+ """Get mixed precision setting."""
+ dtype_map = {
+ DType.BF16: "bf16",
+ DType.FP16: "fp16",
+ DType.FP32: "no",
+ }
+ return dtype_map.get(self.training_config.dtype, "no")
+
+ def _get_downcast_bf16(self) -> str:
+ """Get downcast BF16 setting."""
+ return "no" if self.training_config.dtype == DType.BF16 else "no"
+
+ def _get_fsdp_config(self) -> dict:
+ """Get FSDP-specific configuration."""
+ sharding_strategy_map = {
+ "no_shard": "NO_SHARD",
+ "shard_grad_op": "SHARD_GRAD_OP",
+ "full_shard": "FULL_SHARD",
+ }
+
+ config = {
+ "fsdp_sharding_strategy": sharding_strategy_map.get(
+ self.engine_config.sharding_strategy, "FULL_SHARD"
+ ),
+ "fsdp_offload_params": False,
+ "fsdp_origin_params": True,
+ "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+ "fsdp_transformer_layer_cls_to_wrap": self._get_transformer_layer_cls(),
+ "fsdp_backward_prefetch": "BACKWARD_PRE",
+ "fsdp_forward_prefetch": False,
+ "fsdp_use_orig_params": True,
+ "fsdp_cpu_ram_efficient_loading": True,
+ }
+
+ # Add activation checkpointing if enabled
+ if self.training_config.activation_checkpointing > 0:
+ config["fsdp_activation_checkpointing"] = True
+
+ return config
+
+ def _get_deepspeed_config(self) -> dict:
+ """Get DeepSpeed-specific configuration."""
+ zero_opt: dict = {
+ "stage": self.engine_config.zero_stage or 2,
+ }
+
+ config: dict = {
+ "train_batch_size": self.training_config.batch_size,
+ "train_micro_batch_size_per_gpu": self.training_config.batch_size,
+ "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+ "zero_optimization": zero_opt,
+ "bf16": {"enabled": self.training_config.dtype == DType.BF16},
+ "fp16": {"enabled": self.training_config.dtype == DType.FP16},
+ "gradient_clipping": 1.0,
+ "prescale_gradients": False,
+ "steps_per_print": 100,
+ }
+
+ # Add offload configuration if specified
+ if self.engine_config.offload_optimizer != "none":
+ config["zero_optimization"]["offload_optimizer"] = {
+ "device": "cpu" if self.engine_config.offload_optimizer == "cpu" else "nvme",
+ "pin_memory": True,
+ }
+
+ if self.engine_config.offload_param != "none":
+ config["zero_optimization"]["offload_param"] = {
+ "device": "cpu" if self.engine_config.offload_param == "cpu" else "nvme",
+ "pin_memory": True,
+ }
+
+ return config
+
+ def _get_transformer_layer_cls(self) -> list[str]:
+ """Get transformer layer class names for FSDP auto-wrapping.
+
+ Returns a list of common transformer layer class names based on model architecture.
+ """
+ # Common transformer layer class names
+ common_layers = [
+ "BertLayer",
+ "GPTJBlock",
+ "GPT2Block",
+ "BloomBlock",
+ "LlamaDecoderLayer",
+ "MistralDecoderLayer",
+ "MixtralDecoderLayer",
+ "Qwen2DecoderLayer",
+ "GemmaDecoderLayer",
+ ]
+
+ # Could be customized based on model_config.name
+ return common_layers
diff --git a/src/gpu_mem_calculator/exporters/axolotl.py b/src/gpu_mem_calculator/exporters/axolotl.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c5d397f7972de144203362faec743066287ced5
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/axolotl.py
@@ -0,0 +1,238 @@
+"""Axolotl configuration exporter.
+
+Generates configuration files for Axolotl fine-tuning framework.
+"""
+
+from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
+class AxolotlExporter:
+    """Export configuration to Axolotl YAML format.
+
+    Axolotl uses a YAML configuration file for fine-tuning LLMs
+    with various backends including DeepSpeed, FSDP, and XLA.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        training_config: TrainingConfig,
+        parallelism_config: ParallelismConfig,
+        engine_config: EngineConfig,
+        node_config: NodeConfig | None = None,
+    ) -> None:
+        """Initialize the Axolotl exporter.
+
+        Args:
+            model_config: Model architecture configuration
+            training_config: Training hyperparameters
+            parallelism_config: Parallelism settings
+            engine_config: Training engine configuration
+            node_config: Multi-node configuration (optional)
+        """
+        self.model_config = model_config
+        self.training_config = training_config
+        self.parallelism_config = parallelism_config
+        self.engine_config = engine_config
+        self.node_config = node_config
+
+    def export(self) -> dict:
+        """Export configuration to Axolotl YAML format.
+
+        Returns:
+            Dictionary compatible with Axolotl config file format
+        """
+        config = {
+            # Base model configuration
+            "base_model": self._get_base_model(),
+            "model_type": self._get_model_type(),
+            # Tokenizer
+            "tokenizer_type": "AutoTokenizer",
+            # Training configuration
+            "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+            # NOTE(review): batch_size and micro_batch_size are set to the
+            # same value; Axolotl conventionally treats batch_size as
+            # micro_batch_size * gradient_accumulation_steps — confirm.
+            "batch_size": self.training_config.batch_size,
+            "micro_batch_size": self.training_config.batch_size,
+            "num_epochs": 3,
+            "learning_rate": 2e-4,
+            # 8-bit AdamW is selected only for BF16 runs; other dtypes fall
+            # back to standard torch AdamW.
+            "optimizer": (
+                "adamw_bnb_8bit" if self.training_config.dtype == DType.BF16 else "adamw_torch"
+            ),
+            "bf16": self.training_config.dtype == DType.BF16,
+            "fp16": self.training_config.dtype == DType.FP16,
+            "tf32": True,
+            "gradient_checkpointing": self.training_config.activation_checkpointing > 0,
+        }
+
+        # Add special tokens configuration
+        # NOTE(review): all special tokens are empty strings — verify whether
+        # real token values (e.g. "<s>", "</s>") are expected by Axolotl.
+        config.update(
+            {
+                "special_tokens": {
+                    "bos_token": "",
+                    "eos_token": "",
+                    "unk_token": "",
+                    "pad_token": "",
+                }
+            }
+        )
+
+        # Add distributed training configuration
+        if self.engine_config.type == EngineType.DEEPSPEED:
+            config["deepspeed"] = self._get_deepspeed_config()
+        elif self.engine_config.type == EngineType.FSDP:
+            config["fsdp"] = self._get_fsdp_config()
+
+        # Add multi-GPU configuration
+        if self.node_config and self.node_config.num_nodes > 1:
+            config["num_nodes"] = self.node_config.num_nodes
+            config["gpus_per_node"] = self.node_config.gpus_per_node
+
+        # Add additional training parameters
+        config.update(
+            {
+                "val_set_size": 0.1,  # 10% validation
+                "output_dir": "./output",
+                "logging_steps": 10,
+                "save_steps": 100,
+                "eval_steps": 100,
+                "save_total_limit": 2,
+                "lr_scheduler": "cosine",
+                "warmup_ratio": 0.03,
+                "weight_decay": 0.0,
+                "max_grad_norm": 1.0,
+            }
+        )
+
+        return config
+
+    def _get_base_model(self) -> str:
+        """Get base model path/name.
+
+        Returns a placeholder or extracts from model_config.name
+        """
+        # Try to construct a reasonable model path.
+        # Known short names map to their HuggingFace Hub repo IDs; anything
+        # else passes through unchanged.
+        name_map = {
+            "llama2-7b": "meta-llama/Llama-2-7b-hf",
+            "llama2-13b": "meta-llama/Llama-2-13b-hf",
+            "llama2-70b": "meta-llama/Llama-2-70b-hf",
+            "mistral-7b": "mistralai/Mistral-7B-v0.1",
+            "mixtral-8x7b": "mistralai/Mixtral-8x7B-v0.1",
+            "gpt3-175b": "gpt3-175b-placeholder",  # Not on HF
+        }
+
+        return name_map.get(self.model_config.name.lower(), self.model_config.name)
+
+    def _get_model_type(self) -> str:
+        """Get model type for Axolotl.
+
+        Matches by substring of the lowercased model name; first match wins
+        (dict insertion order), so e.g. "mixtral" must be checked via its own
+        key before the "mistral" substring could — note "mixtral" does not
+        contain "mistral", so ordering is safe here.
+        """
+        model_type_map = {
+            "llama": "LlamaForCausalLM",
+            "mistral": "MistralForCausalLM",
+            "mixtral": "MixtralForCausalLM",
+            "qwen": "Qwen2ForCausalLM",
+            "gemma": "GemmaForCausalLM",
+            "bloom": "BloomForCausalLM",
+            "gpt2": "GPT2LMHeadModel",
+            "gptj": "GPTJForCausalLM",
+            "bert": "BertForMaskedLM",
+        }
+
+        model_name_lower = self.model_config.name.lower()
+        for key, value in model_type_map.items():
+            if key in model_name_lower:
+                return value
+
+        return "LlamaForCausalLM"  # Default
+
+    def _get_deepspeed_config(self) -> dict:
+        """Get DeepSpeed configuration for Axolotl.
+
+        Returns a DeepSpeed JSON-style dict; here train_batch_size is the
+        micro batch multiplied by gradient accumulation (per-GPU effective
+        batch).
+        """
+        zero_opt: dict = {
+            "stage": self.engine_config.zero_stage or 2,
+        }
+
+        config: dict = {
+            "zero_optimization": zero_opt,
+            "bf16": {"enabled": self.training_config.dtype == DType.BF16},
+            "fp16": {"enabled": self.training_config.dtype == DType.FP16},
+            "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+            "train_micro_batch_size_per_gpu": self.training_config.batch_size,
+            "train_batch_size": self.training_config.batch_size
+            * self.training_config.gradient_accumulation_steps,
+        }
+
+        # Add offload configuration
+        if self.engine_config.offload_optimizer != "none":
+            config["zero_optimization"]["offload_optimizer"] = {
+                "device": "cpu" if self.engine_config.offload_optimizer == "cpu" else "nvme",
+                "pin_memory": True,
+            }
+
+        if self.engine_config.offload_param != "none":
+            config["zero_optimization"]["offload_param"] = {
+                "device": "cpu" if self.engine_config.offload_param == "cpu" else "nvme",
+                "pin_memory": True,
+            }
+
+        return config
+
+    def _get_fsdp_config(self) -> dict:
+        """Get FSDP configuration for Axolotl."""
+        # Map internal sharding-strategy names to torch FSDP enum names.
+        sharding_strategy_map = {
+            "no_shard": "NO_SHARD",
+            "shard_grad_op": "SHARD_GRAD_OP",
+            "full_shard": "FULL_SHARD",
+        }
+
+        config = {
+            "fsdp_sharding_strategy": sharding_strategy_map.get(
+                self.engine_config.sharding_strategy, "FULL_SHARD"
+            ),
+            "fsdp_offload_params": False,
+            # NOTE(review): "fsdp_origin_params" looks like a typo duplicate of
+            # "fsdp_use_orig_params" set below — check the Axolotl FSDP option
+            # names and drop this key if it is not recognized.
+            "fsdp_origin_params": True,
+            "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+            "fsdp_transformer_layer_cls_to_wrap": self._get_transformer_layer_cls(),
+            "fsdp_backward_prefetch": "BACKWARD_PRE",
+            "fsdp_forward_prefetch": False,
+            "fsdp_use_orig_params": True,
+            "fsdp_cpu_ram_efficient_loading": True,
+        }
+
+        return config
+
+    def _get_transformer_layer_cls(self) -> str:
+        """Get transformer layer class name.
+
+        Selects the decoder/encoder block class to auto-wrap for FSDP by
+        substring match on the lowercased model name; defaults to Llama.
+        """
+        model_layer_map = {
+            "llama": "LlamaDecoderLayer",
+            "mistral": "MistralDecoderLayer",
+            "mixtral": "MixtralDecoderLayer",
+            "qwen": "Qwen2DecoderLayer",
+            "gemma": "GemmaDecoderLayer",
+            "bloom": "BloomBlock",
+            "gpt2": "GPT2Block",
+            "gptj": "GPTJBlock",
+            "bert": "BertLayer",
+        }
+
+        model_name_lower = self.model_config.name.lower()
+        for key, value in model_layer_map.items():
+            if key in model_name_lower:
+                return value
+
+        return "LlamaDecoderLayer"  # Default
+
+    def export_yaml(self) -> str:
+        """Generate YAML configuration string.
+
+        Returns:
+            YAML-formatted configuration string
+        """
+        # Local import keeps PyYAML optional until a YAML export is requested.
+        import yaml  # type: ignore[import-untyped]
+
+        config = self.export()
+        return yaml.dump(config, default_flow_style=False, sort_keys=False)  # type: ignore[no-any-return]
diff --git a/src/gpu_mem_calculator/exporters/lightning.py b/src/gpu_mem_calculator/exporters/lightning.py
new file mode 100644
index 0000000000000000000000000000000000000000..116f465afb4663d0b4e4e37d71c8444294506d6a
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/lightning.py
@@ -0,0 +1,219 @@
+"""PyTorch Lightning configuration exporter.
+
+Generates configuration and trainer setup for PyTorch Lightning training.
+"""
+
+from gpu_mem_calculator.core.models import (
+ DType,
+ EngineConfig,
+ EngineType,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+
+
+class LightningExporter:
+ """Export configuration to PyTorch Lightning format.
+
+ Lightning uses a Trainer class with various strategies for distributed
+ training including DDP, FSDP, and DeepSpeed.
+ """
+
+ def __init__(
+ self,
+ model_config: ModelConfig,
+ training_config: TrainingConfig,
+ parallelism_config: ParallelismConfig,
+ engine_config: EngineConfig,
+ node_config: NodeConfig | None = None,
+ ) -> None:
+ """Initialize the Lightning exporter.
+
+ Args:
+ model_config: Model architecture configuration
+ training_config: Training hyperparameters
+ parallelism_config: Parallelism settings
+ engine_config: Training engine configuration
+ node_config: Multi-node configuration (optional)
+ """
+ self.model_config = model_config
+ self.training_config = training_config
+ self.parallelism_config = parallelism_config
+ self.engine_config = engine_config
+ self.node_config = node_config
+
+ def export(self) -> dict:
+ """Export configuration to Lightning Trainer format.
+
+ Returns:
+ Dictionary with Trainer configuration
+ """
+ config = {
+ "trainer": {
+ "accelerator": "auto",
+ "devices": self._get_num_devices(),
+ "num_nodes": self._get_num_nodes(),
+ "strategy": self._get_strategy(),
+ "precision": self._get_precision(),
+ "max_epochs": 1, # Placeholder
+ "accumulate_grad_batches": self.training_config.gradient_accumulation_steps,
+ "gradient_clip_val": 1.0,
+ "log_every_n_steps": 50,
+ },
+ "model_config": {
+ "model_name": self.model_config.name,
+ "num_parameters": self.model_config.num_parameters,
+ "hidden_size": self.model_config.hidden_size,
+ "num_layers": self.model_config.num_layers,
+ "num_attention_heads": self.model_config.num_attention_heads,
+ "max_seq_len": self.model_config.max_seq_len,
+ },
+ }
+
+ # Add strategy-specific configuration
+ if self.engine_config.type == EngineType.DEEPSPEED:
+ config["deepspeed_config"] = self._get_deepspeed_config()
+ elif self.engine_config.type == EngineType.FSDP:
+ config["fsdp_config"] = self._get_fsdp_config()
+
+ return config
+
+ def _get_num_devices(self) -> int | str:
+ """Get number of devices."""
+ if self.node_config and self.node_config.gpus_per_node:
+ return self.node_config.gpus_per_node
+ return "auto"
+
+ def _get_num_nodes(self) -> int:
+ """Get number of nodes."""
+ if self.node_config:
+ return self.node_config.num_nodes
+ return 1
+
+ def _get_strategy(self) -> str | dict:
+ """Get Lightning training strategy."""
+ if self.engine_config.type == EngineType.FSDP:
+ return "fsdp"
+ elif self.engine_config.type == EngineType.DEEPSPEED:
+ return "deepspeed"
+ elif self.parallelism_config.data_parallel_size > 1:
+ return "ddp"
+ else:
+ return "auto"
+
+ def _get_precision(self) -> str:
+ """Get precision setting."""
+ dtype_map = {
+ DType.BF16: "bf16-mixed",
+ DType.FP16: "16-mixed",
+ DType.FP32: "32",
+ }
+ return dtype_map.get(self.training_config.dtype, "32")
+
+ def _get_deepspeed_config(self) -> dict:
+ """Get DeepSpeed configuration for Lightning."""
+ zero_opt: dict = {
+ "stage": self.engine_config.zero_stage or 2,
+ }
+
+ config: dict = {
+ "zero_stage": self.engine_config.zero_stage or 2,
+ "zero_optimization": zero_opt,
+ "bf16": {"enabled": self.training_config.dtype == DType.BF16},
+ "fp16": {"enabled": self.training_config.dtype == DType.FP16},
+ "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+ "train_micro_batch_size_per_gpu": self.training_config.batch_size,
+ "train_batch_size": self.training_config.batch_size
+ * self.training_config.gradient_accumulation_steps,
+ }
+
+ # Add offload configuration
+ if self.engine_config.offload_optimizer != "none":
+ config["zero_optimization"]["offload_optimizer"] = {
+ "device": "cpu" if self.engine_config.offload_optimizer == "cpu" else "nvme",
+ }
+
+ if self.engine_config.offload_param != "none":
+ config["zero_optimization"]["offload_param"] = {
+ "device": "cpu" if self.engine_config.offload_param == "cpu" else "nvme",
+ }
+
+ return config
+
+ def _get_fsdp_config(self) -> dict:
+ """Get FSDP configuration for Lightning."""
+ sharding_strategy_map = {
+ "no_shard": "NO_SHARD",
+ "shard_grad_op": "SHARD_GRAD_OP",
+ "full_shard": "FULL_SHARD",
+ }
+
+ config = {
+ "sharding_strategy": sharding_strategy_map.get(
+ self.engine_config.sharding_strategy, "FULL_SHARD"
+ ),
+ "cpu_ram_efficient_loading": True,
+ "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+ "transformer_cls_name": self._get_transformer_cls_name(),
+ "activation_checkpointing": self.training_config.activation_checkpointing > 0,
+ }
+
+ return config
+
+ def _get_transformer_cls_name(self) -> str:
+ """Get transformer class name for FSDP wrapping."""
+ # Map common model names to their layer classes
+ model_layer_map = {
+ "llama": "LlamaDecoderLayer",
+ "mistral": "MistralDecoderLayer",
+ "mixtral": "MixtralDecoderLayer",
+ "qwen": "Qwen2DecoderLayer",
+ "gemma": "GemmaDecoderLayer",
+ "bloom": "BloomBlock",
+ "gpt2": "GPT2Block",
+ "gptj": "GPTJBlock",
+ "bert": "BertLayer",
+ }
+
+ # Try to match based on model name
+ model_name_lower = self.model_config.name.lower()
+ for key, value in model_layer_map.items():
+ if key in model_name_lower:
+ return value
+
+ return "LlamaDecoderLayer" # Default
+
+ def export_code(self) -> str:
+ """Generate Python code for Lightning Trainer setup.
+
+ Returns:
+ String with Python code
+ """
+ config = self.export()
+
+ code = f"""import pytorch_lightning as pl
+from pytorch_lightning.strategies import DeepSpeedStrategy, FSDPStrategy
+
+# Model configuration
+model_config = {config["model_config"]}
+
+# Trainer configuration
+trainer = pl.Trainer(
+ accelerator="{config["trainer"]["accelerator"]}",
+ devices={config["trainer"]["devices"]},
+ num_nodes={config["trainer"]["num_nodes"]},
+ strategy="{config["trainer"]["strategy"]}",
+ precision="{config["trainer"]["precision"]}",
+ max_epochs={config["trainer"]["max_epochs"]},
+ accumulate_grad_batches={config["trainer"]["accumulate_grad_batches"]},
+ gradient_clip_val={config["trainer"]["gradient_clip_val"]},
+ log_every_n_steps={config["trainer"]["log_every_n_steps"]},
+)
+
+# Training loop
+# model = YourModel(model_config)
+# trainer.fit(model)
+"""
+ return code
diff --git a/src/gpu_mem_calculator/exporters/manager.py b/src/gpu_mem_calculator/exporters/manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d5f7fcdacc915d31d7c836d0dc8125fc82f8e8b
--- /dev/null
+++ b/src/gpu_mem_calculator/exporters/manager.py
@@ -0,0 +1,223 @@
+"""Export manager for framework configurations.
+
+Provides a unified interface for exporting configurations to various
+training framework formats.
+"""
+
+from enum import Enum
+
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+from gpu_mem_calculator.exporters.accelerate import AccelerateExporter
+from gpu_mem_calculator.exporters.axolotl import AxolotlExporter
+from gpu_mem_calculator.exporters.lightning import LightningExporter
+
+
+class ExportFormat(str, Enum):
+    """Supported export formats.
+
+    Subclasses ``str`` so members compare equal to their plain string
+    values (relied on by ExportManager's match/format checks).
+    """
+
+    ACCELERATE = "accelerate"  # HF Accelerate launcher config
+    LIGHTNING = "lightning"  # PyTorch Lightning Trainer config
+    AXOLOTL = "axolotl"  # Axolotl fine-tuning config
+    DEEPSPEED = "deepspeed"  # Raw DeepSpeed config dict
+    YAML = "yaml"  # Generic YAML dump of all configs
+    JSON = "json"  # Generic JSON dump of all configs
+
+
+class ExportManager:
+    """Unified export manager for all framework configurations.
+
+    This class provides a simple interface to export training
+    configurations to various framework formats.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        training_config: TrainingConfig,
+        parallelism_config: ParallelismConfig,
+        engine_config: EngineConfig,
+        node_config: NodeConfig | None = None,
+    ) -> None:
+        """Initialize the export manager.
+
+        Args:
+            model_config: Model architecture configuration
+            training_config: Training hyperparameters
+            parallelism_config: Parallelism settings
+            engine_config: Training engine configuration
+            node_config: Multi-node configuration (optional)
+        """
+        self.model_config = model_config
+        self.training_config = training_config
+        self.parallelism_config = parallelism_config
+        self.engine_config = engine_config
+        self.node_config = node_config
+
+        # Initialize exporters eagerly; all three share the same configs.
+        self.accelerate_exporter = AccelerateExporter(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            engine_config=engine_config,
+            node_config=node_config,
+        )
+
+        self.lightning_exporter = LightningExporter(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            engine_config=engine_config,
+            node_config=node_config,
+        )
+
+        self.axolotl_exporter = AxolotlExporter(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            engine_config=engine_config,
+            node_config=node_config,
+        )
+
+    def export(self, format: ExportFormat | str) -> dict | str:
+        """Export configuration to specified format.
+
+        Args:
+            format: Export format (accelerate, lightning, axolotl, deepspeed, yaml, json)
+
+        Returns:
+            Dictionary or string with exported configuration
+
+        Raises:
+            ValueError: If the format string is not a known export format.
+        """
+        format_str = format.value if isinstance(format, ExportFormat) else format
+
+        # format_str is a plain str; matching against ExportFormat members
+        # works because ExportFormat subclasses str (value patterns use ==).
+        match format_str:
+            case ExportFormat.ACCELERATE:
+                return self.accelerate_exporter.export()
+            case ExportFormat.LIGHTNING:
+                return self.lightning_exporter.export()
+            case ExportFormat.AXOLOTL:
+                return self.axolotl_exporter.export()
+            case ExportFormat.DEEPSPEED:
+                # DeepSpeed config is embedded in accelerate export
+                config = self.accelerate_exporter.export()
+                return config.get("deepspeed_config", {})  # type: ignore[no-any-return]
+            case ExportFormat.YAML:
+                return self._export_yaml()
+            case ExportFormat.JSON:
+                return self._export_json()
+            case _:
+                raise ValueError(f"Unknown export format: {format}")
+
+    def export_to_file(
+        self,
+        format: ExportFormat | str,
+        filepath: str,
+    ) -> None:
+        """Export configuration to a file.
+
+        Dict results are serialized as YAML or JSON depending on the
+        requested format; string results are written verbatim.
+
+        Args:
+            format: Export format
+            filepath: Path to output file
+        """
+        config = self.export(format)
+
+        if isinstance(config, dict):
+            if format == ExportFormat.YAML or (
+                isinstance(format, str) and format.lower() == "yaml"
+            ):
+                import yaml  # type: ignore[import-untyped]
+
+                with open(filepath, "w") as f:
+                    yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+            else:
+                import json
+
+                with open(filepath, "w") as f:
+                    json.dump(config, f, indent=2)
+        else:
+            with open(filepath, "w") as f:
+                f.write(config)
+
+    def _export_yaml(self) -> str:
+        """Export configuration to generic YAML format.
+
+        Returns:
+            YAML-formatted configuration string
+        """
+        import yaml  # type: ignore[import-untyped]
+
+        # Hand-built (rather than model_dump) so only the stable, documented
+        # fields end up in the generic YAML.
+        config = {
+            "model": {
+                "name": self.model_config.name,
+                "num_parameters": self.model_config.num_parameters,
+                "num_layers": self.model_config.num_layers,
+                "hidden_size": self.model_config.hidden_size,
+                "num_attention_heads": self.model_config.num_attention_heads,
+                "vocab_size": self.model_config.vocab_size,
+                "max_seq_len": self.model_config.max_seq_len,
+                "moe_enabled": self.model_config.moe_enabled,
+            },
+            "training": {
+                "batch_size": self.training_config.batch_size,
+                "gradient_accumulation_steps": self.training_config.gradient_accumulation_steps,
+                "optimizer": self.training_config.optimizer.value,
+                "dtype": self.training_config.dtype.value,
+                "activation_checkpointing": self.training_config.activation_checkpointing,
+            },
+            "parallelism": {
+                "tensor_parallel_size": self.parallelism_config.tensor_parallel_size,
+                "pipeline_parallel_size": self.parallelism_config.pipeline_parallel_size,
+                "data_parallel_size": self.parallelism_config.data_parallel_size,
+                "sequence_parallel": self.parallelism_config.sequence_parallel,
+            },
+            "engine": {
+                "type": self.engine_config.type.value,
+                "zero_stage": self.engine_config.zero_stage,
+                "offload_optimizer": self.engine_config.offload_optimizer.value,
+                "offload_param": self.engine_config.offload_param.value,
+            },
+        }
+
+        # Add node configuration if multi-node
+        if self.node_config and self.node_config.num_nodes > 1:
+            config["multinode"] = {
+                "num_nodes": self.node_config.num_nodes,
+                "gpus_per_node": self.node_config.gpus_per_node,
+                "interconnect_type": self.node_config.interconnect_type.value,
+            }
+
+        return yaml.dump(config, default_flow_style=False, sort_keys=False)  # type: ignore[no-any-return]
+
+    def _export_json(self) -> str:
+        """Export configuration to JSON format.
+
+        Returns:
+            JSON-formatted configuration string
+        """
+        import json
+
+        # Full pydantic dumps here, unlike the curated _export_yaml view.
+        config = {
+            "model": self.model_config.model_dump(),
+            "training": self.training_config.model_dump(),
+            "parallelism": self.parallelism_config.model_dump(),
+            "engine": self.engine_config.model_dump(),
+        }
+
+        # Add node configuration if multi-node
+        # (included for any node_config here, even single-node)
+        if self.node_config:
+            config["multinode"] = self.node_config.model_dump()
+
+        return json.dumps(config, indent=2)
+
+    def get_supported_formats(self) -> list[str]:
+        """Get list of supported export formats.
+
+        Returns:
+            List of format names
+        """
+        return [f.value for f in ExportFormat]
diff --git a/src/gpu_mem_calculator/inference/__init__.py b/src/gpu_mem_calculator/inference/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7f29f09e776240cfef2a71e8a8f2f045556c38
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/__init__.py
@@ -0,0 +1,11 @@
+"""Inference memory calculation module."""
+
+from gpu_mem_calculator.inference.calculator import InferenceMemoryCalculator
+from gpu_mem_calculator.inference.huggingface import HuggingFaceEngine
+from gpu_mem_calculator.inference.sglang import SGLangEngine
+
+__all__ = [
+ "InferenceMemoryCalculator",
+ "HuggingFaceEngine",
+ "SGLangEngine",
+]
diff --git a/src/gpu_mem_calculator/inference/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f549edd147c1016c57bd3e9d566a362fd8aa02e
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/base.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/base.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..510107de82cf36f8cf24fe6272d1a162d64d6091
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/base.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/calculator.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/calculator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b63d6762499335b30f4bdaeba5ca41b0987b0716
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/calculator.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/huggingface.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/huggingface.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3c9d999375640bad52610e5f46959aa4a8cabed
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/huggingface.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/sglang.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/sglang.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73ac1bc60c86c990b249bbfcf9ad7bf7d7ecf449
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/sglang.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/tensorrt_llm.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/tensorrt_llm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd45bbae55c34d36c33fa46fa93badb042c5bc15
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/tensorrt_llm.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/tgi.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/tgi.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3cd32ee65f1deb416d0bfd72a4d988250003891
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/tgi.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/__pycache__/vllm.cpython-312.pyc b/src/gpu_mem_calculator/inference/__pycache__/vllm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..348530adf1b79a493b07d30b1256d3ac47b827fa
Binary files /dev/null and b/src/gpu_mem_calculator/inference/__pycache__/vllm.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/inference/base.py b/src/gpu_mem_calculator/inference/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e185e21d1d8a930902be6e8ea3c9dda97b71a3e3
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/base.py
@@ -0,0 +1,185 @@
+"""Base class for inference engine implementations."""
+
+from abc import ABC, abstractmethod
+
+from gpu_mem_calculator.core.models import (
+ GPUConfig,
+ InferenceConfig,
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+ ModelConfig,
+)
+
+
+class BaseInferenceEngine(ABC):
+    """Abstract base class for inference engine memory calculation.
+
+    Each inference engine (vLLM, TGI, TensorRT-LLM, etc.)
+    should implement this interface to provide engine-specific
+    memory calculations.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        inference_config: InferenceConfig,
+        gpu_config: GPUConfig,
+    ) -> None:
+        """Initialize the inference engine with configuration.
+
+        Args:
+            model_config: Model architecture configuration
+            inference_config: Inference hyperparameters
+            gpu_config: Hardware configuration
+        """
+        self.model_config = model_config
+        self.inference_config = inference_config
+        self.gpu_config = gpu_config
+
+    @abstractmethod
+    def calculate_memory(self) -> InferenceMemoryResult:
+        """Calculate memory requirements for inference.
+
+        This is the main method that should be implemented by each engine.
+
+        Returns:
+            InferenceMemoryResult with complete memory breakdown
+        """
+        pass
+
+    def _check_feasibility(
+        self,
+        total_memory_per_gpu: float,
+    ) -> tuple[bool, float, int | None]:
+        """Check if the configuration fits on available GPU.
+
+        Args:
+            total_memory_per_gpu: Total memory required per GPU
+
+        Returns:
+            Tuple of (fits_on_gpu, utilization_percent, max_batch_size)
+        """
+        # Usable memory is capped by the configured utilization fraction,
+        # but the reported utilization percent is against the full GPU.
+        available_memory = (
+            self.gpu_config.gpu_memory_gb * self.inference_config.gpu_memory_utilization
+        )
+        utilization_percent = (total_memory_per_gpu / self.gpu_config.gpu_memory_gb) * 100
+
+        fits_on_gpu = total_memory_per_gpu <= available_memory
+
+        # Find max batch size that fits
+        max_batch_size = None
+        if fits_on_gpu:
+            # Try to estimate max batch size
+            # This is a simplified heuristic
+            # NOTE(review): despite the name, this is memory per *sample* at
+            # the current batch size, and it includes fixed costs (model
+            # weights), so the estimate is conservative — it reduces to
+            # available/total * current_batch.
+            current_batch = self.inference_config.batch_size
+            overhead_per_token = total_memory_per_gpu / current_batch
+            potential_max_batch = int(available_memory / overhead_per_token)
+            max_batch_size = max(1, potential_max_batch)
+
+        return fits_on_gpu, utilization_percent, max_batch_size
+
+    def _create_result(
+        self,
+        breakdown: InferenceMemoryBreakdown,
+    ) -> InferenceMemoryResult:
+        """Create an InferenceMemoryResult from breakdown.
+
+        Args:
+            breakdown: Memory breakdown by component
+
+        Returns:
+            Complete InferenceMemoryResult
+        """
+        total_memory_per_gpu = breakdown.total_memory_gb
+        # GPUs are counted via tensor parallelism only.
+        num_gpus = self.inference_config.tensor_parallel_size
+        total_memory_all_gpus = total_memory_per_gpu * num_gpus
+
+        fits_on_gpu, utilization_percent, max_batch_size = self._check_feasibility(
+            total_memory_per_gpu
+        )
+
+        # Estimate throughput (simplified heuristic)
+        estimated_throughput = None
+        if fits_on_gpu:
+            # Rough estimate: assumes each token takes ~50ms to process
+            # This varies wildly based on hardware and model
+            # NOTE(review): the formula actually charges 50ms per *batch*
+            # (tokens_per_batch / 0.05), not per token — confirm which the
+            # heuristic intends.
+            tokens_per_batch = self.inference_config.batch_size * self._get_effective_seq_len()
+            estimated_throughput = tokens_per_batch / 0.05  # 50ms per batch
+
+        return InferenceMemoryResult(
+            total_memory_per_gpu_gb=total_memory_per_gpu,
+            total_memory_all_gpus_gb=total_memory_all_gpus,
+            breakdown=breakdown,
+            fits_on_gpu=fits_on_gpu,
+            memory_utilization_percent=utilization_percent,
+            max_supported_batch_size=max_batch_size,
+            estimated_throughput_tokens_per_sec=estimated_throughput,
+        )
+
+    def _get_effective_seq_len(self) -> int:
+        """Get effective sequence length for inference.
+
+        Falls back to the model's max_seq_len when the inference config
+        does not override it.
+        """
+        return self.inference_config.max_seq_len or self.model_config.max_seq_len
+
+    def _get_kv_cache_bytes_per_token(self) -> int:
+        """Calculate KV cache bytes per token.
+
+        Returns:
+            Bytes per token for KV cache (considering quantization)
+        """
+        # Base: 2 * num_layers * num_heads * head_dim * bytes_per_value
+        # For each token, we store K and V for each layer
+        # NOTE(review): this uses num_attention_heads; for GQA/MQA models the
+        # KV cache only stores num_kv_heads, so this overestimates — confirm
+        # whether ModelConfig exposes a KV-head count.
+        num_layers = self.model_config.num_layers
+        num_heads = self.model_config.num_attention_heads
+        head_dim = self.model_config.hidden_size // num_heads
+
+        # Determine bytes per value based on quantization
+        quantization = self.inference_config.kv_cache_quantization
+        bytes_per_value = {
+            "none": 2,  # FP16/BF16
+            "int8": 1,
+            "fp8": 1,
+            "int4": 0.5,
+        }[quantization.value]
+
+        # KV cache = 2 (K and V) * num_layers * num_heads * head_dim * bytes_per_value
+        kv_bytes_per_token = 2 * num_layers * num_heads * head_dim * bytes_per_value
+
+        return int(kv_bytes_per_token)
+
+    def _calculate_model_params_bytes(self) -> int:
+        """Calculate model parameter memory in bytes.
+
+        Returns:
+            Bytes needed for model parameters
+        """
+        dtype_bytes = {
+            "fp32": 4,
+            "fp16": 2,
+            "bf16": 2,
+            "int8": 1,
+            "int4": 0.5,
+        }
+
+        # Assume model is loaded in BF16/FP16 for inference
+        # (dtype is fixed here; weight quantization is not consulted)
+        dtype = "bf16"
+        num_params = self.model_config.num_parameters
+
+        return int(num_params * dtype_bytes[dtype])
+
+    def _calculate_kv_cache_bytes(self, batch_size: int) -> int:
+        """Calculate KV cache memory in bytes.
+
+        Args:
+            batch_size: Batch size to calculate for
+
+        Returns:
+            Bytes needed for KV cache (0 when KV caching is disabled)
+        """
+        if not self.inference_config.use_kv_cache:
+            return 0
+
+        seq_len = self._get_effective_seq_len()
+        kv_bytes_per_token = self._get_kv_cache_bytes_per_token()
+
+        # KV cache = batch_size * seq_len * kv_bytes_per_token
+        return batch_size * seq_len * kv_bytes_per_token
diff --git a/src/gpu_mem_calculator/inference/calculator.py b/src/gpu_mem_calculator/inference/calculator.py
new file mode 100644
index 0000000000000000000000000000000000000000..acd8f03a8f2cb6c96b3a4541321ec908e2e477fa
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/calculator.py
@@ -0,0 +1,104 @@
+"""Main inference memory calculator.
+
+Orchestrates the inference memory calculation by selecting the appropriate
+inference engine and aggregating results.
+"""
+
+from gpu_mem_calculator.core.models import (
+ GPUConfig,
+ InferenceConfig,
+ InferenceEngineType,
+ InferenceMemoryResult,
+ ModelConfig,
+)
+from gpu_mem_calculator.inference.huggingface import HuggingFaceEngine
+from gpu_mem_calculator.inference.sglang import SGLangEngine
+from gpu_mem_calculator.inference.tensorrt_llm import TensorRTLLMEngine
+from gpu_mem_calculator.inference.tgi import TGIEngine
+from gpu_mem_calculator.inference.vllm import VLLMEngine
+
+# Type alias for inference engine types
+InferenceEngineAlias = HuggingFaceEngine | VLLMEngine | TGIEngine | TensorRTLLMEngine | SGLangEngine
+
+
+class InferenceMemoryCalculator:
+    """Main inference memory calculator.
+
+    This class provides a high-level interface for calculating
+    GPU memory requirements for LLM inference with different engines.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        inference_config: InferenceConfig,
+        gpu_config: GPUConfig | None = None,
+    ) -> None:
+        """Initialize the inference calculator.
+
+        Args:
+            model_config: Model architecture configuration
+            inference_config: Inference hyperparameters
+            gpu_config: Hardware configuration (default: 1x 80GB GPU)
+        """
+        self.model_config = model_config
+        self.inference_config = inference_config
+        # Fall back to GPUConfig defaults when no hardware is specified.
+        self.gpu_config = gpu_config or GPUConfig()
+
+    def calculate(self, engine_type: InferenceEngineType) -> InferenceMemoryResult:
+        """Calculate inference GPU memory requirements.
+
+        Selects the appropriate inference engine based on the specified type
+        and returns the memory calculation result.
+
+        Args:
+            engine_type: The inference engine to use
+
+        Returns:
+            InferenceMemoryResult with complete memory breakdown
+        """
+        engine = self._get_engine(engine_type)
+        return engine.calculate_memory()
+
+    def _get_engine(self, engine_type: InferenceEngineType) -> InferenceEngineAlias:
+        """Get the appropriate inference engine instance.
+
+        Args:
+            engine_type: The type of inference engine
+
+        Returns:
+            Engine instance configured with current settings
+
+        Raises:
+            ValueError: If the engine type is not recognized.
+        """
+        # A new engine instance is created per call; engines are stateless
+        # beyond the configs passed in.
+        match engine_type:
+            case InferenceEngineType.HUGGINGFACE:
+                return HuggingFaceEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            case InferenceEngineType.VLLM:
+                return VLLMEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            case InferenceEngineType.TGI:
+                return TGIEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            # Both spellings map to the same TensorRT-LLM engine.
+            case InferenceEngineType.TENSORRT_LLM | InferenceEngineType.TRTLLM:
+                return TensorRTLLMEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            case InferenceEngineType.SGLANG:
+                return SGLangEngine(
+                    model_config=self.model_config,
+                    inference_config=self.inference_config,
+                    gpu_config=self.gpu_config,
+                )
+            case _:
+                raise ValueError(f"Unknown inference engine type: {engine_type}")
diff --git a/src/gpu_mem_calculator/inference/huggingface.py b/src/gpu_mem_calculator/inference/huggingface.py
new file mode 100644
index 0000000000000000000000000000000000000000..53f36bf1b786b267ccd45e3d9202ea98a4bc61ca
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/huggingface.py
@@ -0,0 +1,95 @@
+"""HuggingFace Transformers inference engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class HuggingFaceEngine(BaseInferenceEngine):
    """HuggingFace Transformers inference engine.

    Models standard HuggingFace inference, including optimizations
    such as Flash Attention and torch.compile.
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Estimate GPU memory for HF Transformers inference.

        The estimate is split into four components: model parameters
        (sharded across tensor-parallel ranks), KV cache, forward-pass
        activations, and framework overhead.

        Returns:
            InferenceMemoryResult with the complete memory breakdown.
        """
        cfg = self.inference_config
        gib = 1024**3

        # Parameters are sharded evenly across tensor-parallel GPUs.
        params_gb = (self._calculate_model_params_bytes() / cfg.tensor_parallel_size) / gib

        # Standard (non-paged) KV cache implementation.
        kv_gb = self._calculate_kv_cache_bytes(cfg.batch_size) / gib

        # Plain PyTorch activations, no kernel fusion.
        act_gb = self._calculate_hf_activations(cfg.batch_size) / gib

        return self._create_result(
            InferenceMemoryBreakdown(
                model_params_gb=params_gb,
                kv_cache_gb=kv_gb,
                activations_gb=act_gb,
                overhead_gb=self._calculate_hf_overhead(),
            )
        )

    def _calculate_hf_activations(self, batch_size: int) -> int:
        """Estimate forward-pass activation memory in bytes.

        Assumes a standard PyTorch forward pass (no kernel fusion)
        with FP16/BF16 activations (2 bytes per value).

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Activation memory in bytes.
        """
        fp16_bytes = 2  # FP16/BF16 activations
        per_layer_values = self._get_effective_seq_len() * self.model_config.hidden_size
        return int(batch_size * per_layer_values * self.model_config.num_layers * fp16_bytes)

    def _calculate_hf_overhead(self) -> float:
        """Estimate HF Transformers runtime overhead in GB.

        Covers the base PyTorch runtime (~150MB) plus model-loading
        overhead (~50MB); even inference-only runs keep some framework
        bookkeeping resident.

        Returns:
            Overhead in GB.
        """
        pytorch_runtime_gb = 0.15  # base PyTorch runtime footprint
        model_loading_gb = 0.05  # buffers retained after model load
        return pytorch_runtime_gb + model_loading_gb
diff --git a/src/gpu_mem_calculator/inference/sglang.py b/src/gpu_mem_calculator/inference/sglang.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee3334ca431e7d0879300c3edc180c8d559f54d4
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/sglang.py
@@ -0,0 +1,200 @@
+"""SGLang inference engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class SGLangEngine(BaseInferenceEngine):
    """SGLang inference engine with RadixAttention memory management.

    SGLang uses RadixAttention to efficiently manage KV cache memory
    with tree-based cache sharing and chunked prefill.
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Estimate GPU memory for SGLang inference.

        Components:
        - Model parameters, sharded across tensor-parallel ranks
        - KV cache managed by RadixAttention (tree-based prefix sharing)
        - Forward-pass activations (chunked prefill)
        - Scheduler / RadixCache / worker overhead

        Returns:
            InferenceMemoryResult with the complete memory breakdown.
        """
        cfg = self.inference_config
        gib = 1024**3

        params_gb = (self._calculate_model_params_bytes() / cfg.tensor_parallel_size) / gib
        kv_gb = self._calculate_sglang_kv_cache(cfg.batch_size) / gib
        act_gb = self._calculate_activations(cfg.batch_size) / gib

        return self._create_result(
            InferenceMemoryBreakdown(
                model_params_gb=params_gb,
                kv_cache_gb=kv_gb,
                activations_gb=act_gb,
                overhead_gb=self._calculate_sglang_overhead(),
            )
        )

    def _calculate_sglang_kv_cache(self, batch_size: int) -> int:
        """Estimate KV cache memory for SGLang with RadixAttention.

        RadixAttention keeps KV entries in a tree so that requests with
        a common prefix share cache blocks; a ~30% saving from prefix
        sharing is assumed unless the radix cache is disabled.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            KV cache memory in bytes (0 when KV caching is disabled).
        """
        cfg = self.inference_config
        if not cfg.use_kv_cache:
            return 0

        # Chunked prefill bounds how many tokens are cached at once.
        chunk = cfg.chunk_size or 8192
        seq_len = self._get_effective_seq_len()
        running = cfg.max_running_requests or batch_size * 4
        per_token = self._get_kv_cache_bytes_per_token()

        # Prefix sharing saves ~30% when the radix cache is enabled.
        sharing = 1.0 if cfg.disable_radix_cache else 0.7
        tokens = batch_size * min(seq_len, chunk)
        kv_bytes = tokens * per_token * sharing

        # Concurrent requests share cache; growth is capped at 2x.
        return int(kv_bytes * min(running / batch_size, 2.0))

    def _calculate_activations(self, batch_size: int) -> int:
        """Estimate activation memory for SGLang in bytes.

        Chunked prefill and optimized attention kernels reduce the
        resident activation footprint relative to plain PyTorch.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Activation memory in bytes.
        """
        cfg = self.inference_config
        # Sequences are processed in chunks during prefill.
        eff_len = min(self._get_effective_seq_len(), cfg.chunk_size or 8192)

        base = (
            batch_size
            * eff_len
            * self.model_config.hidden_size
            * self.model_config.num_layers
            * 2  # bytes per FP16/BF16 value
            * 2  # forward-pass working buffers
        )

        # Chunked prefill: ~40% reduction of the base estimate.
        estimate = int(base * 0.6)
        if cfg.enable_torch_compile:
            # torch.compile: ~20% additional reduction.
            estimate = int(estimate * 0.8)
        return estimate

    def _calculate_sglang_overhead(self) -> float:
        """Estimate SGLang-specific overhead in GB.

        Accumulates the scheduler/RadixCache baseline plus optional
        contributions for the radix tree metadata, P2P buffers, custom
        all-reduce buffers, multi-LoRA adapters, and speculative
        decoding draft models.

        Returns:
            Overhead in GB.
        """
        cfg = self.inference_config
        total = 0.15  # scheduler + RadixCache baseline (~100-200MB)

        if not cfg.disable_radix_cache:
            # ~32 bytes of metadata per tree node, roughly one node per
            # 100 cached tokens.
            nodes = cfg.batch_size * self._get_effective_seq_len() // 100
            total += (nodes * 32) / (1024**3)

        if cfg.enable_p2p:
            total += 0.1  # ~100MB of P2P communication buffers
        if not cfg.disable_custom_all_reduce:
            total += 0.08  # ~80MB of custom all-reduce buffers
        if cfg.multi_lora_enabled:
            total += 0.2  # ~200MB for resident LoRA adapters
        if cfg.speculative_algo != "default":
            total += 0.15  # ~150MB for speculative-decoding draft models

        return total
diff --git a/src/gpu_mem_calculator/inference/tensorrt_llm.py b/src/gpu_mem_calculator/inference/tensorrt_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..38e9e410001d2f05e453b625f62ba6322b3847e4
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/tensorrt_llm.py
@@ -0,0 +1,104 @@
+"""TensorRT-LLM inference engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class TensorRTLLMEngine(BaseInferenceEngine):
    """TensorRT-LLM inference engine with optimized inference kernels.

    TensorRT-LLM provides highly optimized inference through:
    - Weight-only quantization (INT4/INT8)
    - Fused attention kernels
    - In-flight batching
    - Custom CUDA kernels
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Estimate GPU memory for TensorRT-LLM inference.

        Components:
        - Model parameters, sharded across tensor-parallel ranks
        - KV cache
        - Activations, kept small by fused kernels
        - TensorRT runtime and engine-workspace overhead

        Returns:
            InferenceMemoryResult with the complete memory breakdown.
        """
        cfg = self.inference_config
        gib = 1024**3

        params_gb = (self._calculate_model_params_bytes() / cfg.tensor_parallel_size) / gib
        kv_gb = self._calculate_kv_cache_bytes(cfg.batch_size) / gib
        act_gb = self._calculate_tensorrt_activations(cfg.batch_size) / gib

        return self._create_result(
            InferenceMemoryBreakdown(
                model_params_gb=params_gb,
                kv_cache_gb=kv_gb,
                activations_gb=act_gb,
                overhead_gb=self._calculate_tensorrt_overhead(),
            )
        )

    def _calculate_tensorrt_activations(self, batch_size: int) -> int:
        """Estimate activation memory in bytes.

        TensorRT-LLM fuses many operations, so only ~30% of the
        standard FP16/BF16 activation footprint is assumed resident.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Activation memory in bytes.
        """
        standard = (
            batch_size
            * self._get_effective_seq_len()
            * self.model_config.hidden_size
            * self.model_config.num_layers
            * 2  # bytes per FP16/BF16 value
        )
        return int(standard * 0.3)  # fused kernels keep ~30% resident

    def _calculate_tensorrt_overhead(self) -> float:
        """Estimate TensorRT-LLM runtime overhead in GB.

        Covers the TensorRT runtime (~100MB), engine workspace for
        temporary buffers (~200MB), and in-flight batching
        bookkeeping (~50MB).

        Returns:
            Overhead in GB.
        """
        runtime_gb = 0.1  # TensorRT runtime
        workspace_gb = 0.2  # engine workspace (scales with model size)
        batching_gb = 0.05  # in-flight batching structures
        return runtime_gb + workspace_gb + batching_gb
diff --git a/src/gpu_mem_calculator/inference/tgi.py b/src/gpu_mem_calculator/inference/tgi.py
new file mode 100644
index 0000000000000000000000000000000000000000..21305d5b2f45913a4b0b7a7885d7b88d1b068f7f
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/tgi.py
@@ -0,0 +1,109 @@
+"""TGI (Text Generation Inference) engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class TGIEngine(BaseInferenceEngine):
    """Text Generation Inference (TGI) engine by HuggingFace.

    TGI is a production-ready inference server with optimized
    attention mechanisms and memory management.
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Estimate GPU memory for TGI inference.

        Components:
        - Model parameters, sharded across tensor-parallel ranks
        - KV cache
        - Flash-Attention-optimized activations
        - TGI server/router overhead

        Returns:
            InferenceMemoryResult with the complete memory breakdown.
        """
        cfg = self.inference_config
        gib = 1024**3

        params_gb = (self._calculate_model_params_bytes() / cfg.tensor_parallel_size) / gib
        kv_gb = self._calculate_kv_cache_bytes(cfg.batch_size) / gib
        act_gb = self._calculate_tgi_activations(cfg.batch_size) / gib

        return self._create_result(
            InferenceMemoryBreakdown(
                model_params_gb=params_gb,
                kv_cache_gb=kv_gb,
                activations_gb=act_gb,
                overhead_gb=self._calculate_tgi_overhead(),
            )
        )

    def _calculate_tgi_activations(self, batch_size: int) -> int:
        """Estimate activation memory in bytes.

        Flash Attention avoids materializing the full attention matrix,
        so ~40% of the standard FP16/BF16 activation footprint is
        assumed resident.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Activation memory in bytes.
        """
        standard = (
            batch_size
            * self._get_effective_seq_len()
            * self.model_config.hidden_size
            * self.model_config.num_layers
            * 2  # bytes per FP16/BF16 value
        )
        return int(standard * 0.4)  # optimized kernels keep ~40% resident

    def _calculate_tgi_overhead(self) -> float:
        """Estimate TGI-specific overhead in GB.

        Covers the TGI server and router (~200MB), NCCL buffers for
        tensor parallelism (~50MB per rank when TP > 1), dynamic
        batching bookkeeping (~50MB), and preallocated Flash Attention
        buffers (~100MB).

        Returns:
            Overhead in GB.
        """
        tp_size = self.inference_config.tensor_parallel_size
        # NCCL communication buffers scale with the TP degree.
        nccl_gb = tp_size * 0.05 if tp_size > 1 else 0.0
        # server/router + NCCL + dynamic batching + attention buffers
        return 0.2 + nccl_gb + 0.05 + 0.1
diff --git a/src/gpu_mem_calculator/inference/vllm.py b/src/gpu_mem_calculator/inference/vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ad11fb98b6dfd904b97c310c88098be9308f8f
--- /dev/null
+++ b/src/gpu_mem_calculator/inference/vllm.py
@@ -0,0 +1,150 @@
+"""vLLM inference engine memory calculation."""
+
+from gpu_mem_calculator.core.models import (
+ InferenceMemoryBreakdown,
+ InferenceMemoryResult,
+)
+from gpu_mem_calculator.inference.base import BaseInferenceEngine
+
+
class VLLMEngine(BaseInferenceEngine):
    """vLLM inference engine with PagedAttention memory management.

    vLLM uses PagedAttention to efficiently manage KV cache memory
    with block-based allocation.
    """

    def calculate_memory(self) -> InferenceMemoryResult:
        """Calculate memory requirements for vLLM inference.

        vLLM memory breakdown:
        - Model parameters: Sharded across tensor parallel GPUs
        - KV cache: Managed in blocks with PagedAttention
        - Activations: Temporary during forward pass
        - Overhead: vLLM scheduler, worker overhead, block tables

        Returns:
            InferenceMemoryResult with complete memory breakdown
        """
        batch_size = self.inference_config.batch_size
        tensor_parallel_size = self.inference_config.tensor_parallel_size

        # 1. Model parameters (sharded across tensor parallel GPUs)
        model_params_bytes = self._calculate_model_params_bytes()
        model_params_per_gpu_gb = (model_params_bytes / tensor_parallel_size) / (1024**3)

        # 2. KV cache with PagedAttention (block-based allocation)
        kv_cache_bytes = self._calculate_vllm_kv_cache(batch_size)
        kv_cache_gb = kv_cache_bytes / (1024**3)

        # 3. Activations (temporary, per batch)
        activations_bytes = self._calculate_activations(batch_size)
        activations_gb = activations_bytes / (1024**3)

        # 4. vLLM overhead (scheduler, block manager, etc.)
        overhead_gb = self._calculate_vllm_overhead()

        breakdown = InferenceMemoryBreakdown(
            model_params_gb=model_params_per_gpu_gb,
            kv_cache_gb=kv_cache_gb,
            activations_gb=activations_gb,
            overhead_gb=overhead_gb,
        )

        return self._create_result(breakdown)

    def _calculate_vllm_kv_cache(self, batch_size: int) -> int:
        """Calculate KV cache memory for vLLM with PagedAttention.

        vLLM uses block-based KV cache management, which is more efficient
        than contiguous allocation. Each block contains multiple token slots.

        Args:
            batch_size: Batch size

        Returns:
            KV cache memory in bytes (0 when KV caching is disabled)
        """
        # Honor the use_kv_cache flag for consistency with the other
        # engines (e.g. SGLangEngine returns 0 when caching is off).
        if not self.inference_config.use_kv_cache:
            return 0

        block_size = self.inference_config.block_size or 16

        # Calculate total tokens needed
        seq_len = self._get_effective_seq_len()
        total_tokens = batch_size * seq_len

        # Number of blocks, rounded up to whole blocks
        num_blocks = (total_tokens + block_size - 1) // block_size

        # Add 20% buffer for dynamic allocation during generation
        num_blocks = int(num_blocks * 1.2)

        # KV cache memory with block allocation
        kv_bytes_per_token = self._get_kv_cache_bytes_per_token()
        total_kv_bytes = num_blocks * block_size * kv_bytes_per_token

        return total_kv_bytes

    def _calculate_activations(self, batch_size: int) -> int:
        """Calculate activation memory for vLLM.

        vLLM optimizes activation memory with kernel fusion
        and efficient attention implementation.

        Args:
            batch_size: Batch size

        Returns:
            Activation memory in bytes
        """
        seq_len = self._get_effective_seq_len()
        hidden_size = self.model_config.hidden_size
        num_layers = self.model_config.num_layers

        # Base activation memory, FP16/BF16 values
        bytes_per_value = 2

        activation_bytes = (
            batch_size
            * seq_len
            * hidden_size
            * num_layers
            * bytes_per_value
            * 2  # Forward pass only (no backward)
        )

        # vLLM optimization: ~50% reduction with kernel fusion
        activation_bytes = int(activation_bytes * 0.5)

        return activation_bytes

    def _calculate_vllm_overhead(self) -> float:
        """Calculate vLLM-specific overhead.

        Includes:
        - Scheduler memory
        - Block table management
        - Worker process overhead
        - CUDA graphs and preallocated buffers

        Returns:
            Overhead in GB
        """
        # Base overhead: ~100-200MB for scheduler and block manager
        base_overhead_gb = 0.15

        # Block table: ~8 bytes (one pointer) per block, including the
        # same 20% dynamic-allocation buffer used for the cache itself
        block_size = self.inference_config.block_size or 16
        seq_len = self._get_effective_seq_len()
        batch_size = self.inference_config.batch_size

        num_blocks = (batch_size * seq_len + block_size - 1) // block_size * 1.2
        block_table_bytes = num_blocks * 8
        block_table_gb = block_table_bytes / (1024**3)

        # Preallocated buffers for CUDA kernels (~50MB)
        buffer_overhead_gb = 0.05

        return base_overhead_gb + block_table_gb + buffer_overhead_gb
diff --git a/src/gpu_mem_calculator/py.typed b/src/gpu_mem_calculator/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/gpu_mem_calculator/utils/__init__.py b/src/gpu_mem_calculator/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..11f9e9765313e6ef76edb438a70b9571b607788b
--- /dev/null
+++ b/src/gpu_mem_calculator/utils/__init__.py
@@ -0,0 +1,5 @@
+"""Utility functions."""
+
+from gpu_mem_calculator.utils.precision import Precision, get_precision_from_dtype
+
+__all__ = ["Precision", "get_precision_from_dtype"]
diff --git a/src/gpu_mem_calculator/utils/__pycache__/__init__.cpython-312.pyc b/src/gpu_mem_calculator/utils/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1e95827a345cf5edf0fe115ea57c4aaf1e68a5c
Binary files /dev/null and b/src/gpu_mem_calculator/utils/__pycache__/__init__.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/utils/__pycache__/precision.cpython-312.pyc b/src/gpu_mem_calculator/utils/__pycache__/precision.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc8bb7fe7036e3ef35a15e239cffcfb5390f4fea
Binary files /dev/null and b/src/gpu_mem_calculator/utils/__pycache__/precision.cpython-312.pyc differ
diff --git a/src/gpu_mem_calculator/utils/precision.py b/src/gpu_mem_calculator/utils/precision.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a96cbe599bbc5c853756241e67e00207527e56a
--- /dev/null
+++ b/src/gpu_mem_calculator/utils/precision.py
@@ -0,0 +1,83 @@
+"""Precision and data type utilities."""
+
+from dataclasses import dataclass
+
+
@dataclass(frozen=True)
class Precision:
    """Precision information for a data type."""

    # Human-readable name, e.g. "FP16".
    name: str
    # Storage width per parameter, in bits.
    bits_per_param: int
    # Storage width per parameter, in bytes (0.5 for packed INT4).
    bytes_per_param: float
    # True for integer (quantized) formats such as INT8/INT4.
    is_integer: bool = False


# Standard precision definitions, keyed by canonical dtype string.
PRECISION_MAP = {
    "fp32": Precision(name="FP32", bits_per_param=32, bytes_per_param=4.0),
    "fp16": Precision(name="FP16", bits_per_param=16, bytes_per_param=2.0),
    "bf16": Precision(name="BF16", bits_per_param=16, bytes_per_param=2.0),
    "int8": Precision(name="INT8", bits_per_param=8, bytes_per_param=1.0, is_integer=True),
    "int4": Precision(name="INT4", bits_per_param=4, bytes_per_param=0.5, is_integer=True),
}

# Common alternative spellings (torch/HuggingFace-style dtype names) mapped
# onto the canonical entries, so callers may pass either form.
PRECISION_MAP.update(
    {
        "float32": PRECISION_MAP["fp32"],
        "float16": PRECISION_MAP["fp16"],
        "half": PRECISION_MAP["fp16"],
        "bfloat16": PRECISION_MAP["bf16"],
    }
)


def get_precision_from_dtype(dtype: str) -> Precision:
    """Get precision info from dtype string.

    Args:
        dtype: Data type string (e.g. "fp32"/"float32", "fp16"/"float16",
            "bf16"/"bfloat16", "int8", "int4"). Matching is
            case-insensitive.

    Returns:
        Precision object with bytes per parameter information

    Raises:
        ValueError: If dtype is not supported
    """
    try:
        return PRECISION_MAP[dtype.lower()]
    except KeyError:
        raise ValueError(
            f"Unsupported dtype: {dtype}. Supported types: {list(PRECISION_MAP.keys())}"
        ) from None
+
+
def bytes_from_params(num_params: int, dtype: str) -> float:
    """Return the memory footprint, in bytes, of ``num_params`` parameters.

    Args:
        num_params: Number of parameters
        dtype: Data type string understood by ``get_precision_from_dtype``

    Returns:
        Memory in bytes
    """
    return num_params * get_precision_from_dtype(dtype).bytes_per_param
+
+
def gb_from_bytes(num_bytes: float) -> float:
    """Convert a byte count to gigabytes.

    NOTE(review): the divisor is 1024**3, i.e. binary gigabytes (GiB),
    matching the rest of the calculator.

    Args:
        num_bytes: Number of bytes

    Returns:
        Number of gigabytes
    """
    gib = 1024**3
    return num_bytes / gib
+
+
def gb_from_params(num_params: int, dtype: str) -> float:
    """Return the memory footprint, in GB, of ``num_params`` parameters.

    Convenience wrapper composing ``bytes_from_params`` with
    ``gb_from_bytes``.

    Args:
        num_params: Number of parameters
        dtype: Data type string

    Returns:
        Memory in GB
    """
    return gb_from_bytes(bytes_from_params(num_params, dtype))
diff --git a/web/__init__.py b/web/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b26048ff005fcd993b241f5ec241630ea36ff38b
--- /dev/null
+++ b/web/__init__.py
@@ -0,0 +1 @@
+"""Web application for GPU Memory Calculator."""
diff --git a/web/__pycache__/__init__.cpython-312.pyc b/web/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..477381308b81a89e9bbe50f65ccfbdd762d178d2
Binary files /dev/null and b/web/__pycache__/__init__.cpython-312.pyc differ
diff --git a/web/__pycache__/app.cpython-312.pyc b/web/__pycache__/app.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b76f75481ba22d3b1a8f8dd76ff94343f4d997f6
Binary files /dev/null and b/web/__pycache__/app.cpython-312.pyc differ
diff --git a/web/app.py b/web/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..66fc48171a4f13ee69a14fce7bb348b95b68bb88
--- /dev/null
+++ b/web/app.py
@@ -0,0 +1,1161 @@
+"""FastAPI backend for GPU Memory Calculator web application."""
+
+import hashlib
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from pydantic import BaseModel, Field, field_validator, model_validator
+from starlette.requests import Request
+
+from gpu_mem_calculator.config.presets import load_presets
+from gpu_mem_calculator.core.calculator import GPUMemoryCalculator
+from gpu_mem_calculator.core.models import (
+ EngineConfig,
+ GPUConfig,
+ InferenceConfig,
+ InferenceEngineType,
+ InterconnectType,
+ MemoryResult,
+ ModelConfig,
+ NodeConfig,
+ ParallelismConfig,
+ TrainingConfig,
+)
+from gpu_mem_calculator.core.multinode import MultiNodeCalculator
+from gpu_mem_calculator.exporters.manager import ExportFormat, ExportManager
+from gpu_mem_calculator.inference.calculator import InferenceMemoryCalculator
+
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="GPU Memory Calculator",
    description="Calculate GPU memory requirements for LLM training",
    version="0.1.0",
)

# Configure CORS
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests (the CORS spec forbids the
# wildcard origin with credentials) — confirm whether credentials are needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Setup templates and static files
# Templates and static assets are resolved relative to this module's directory.
BASE_DIR = Path(__file__).parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))

# Mount static files
# Mounted only when the directory exists, so the API can run without assets.
static_dir = BASE_DIR / "static"
if static_dir.exists():
    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
+
+
+# Request/Response models
class CalculateRequest(BaseModel):
    """Request model for memory calculation with comprehensive validation."""

    model: dict[str, Any] = Field(description="Model configuration")
    training: dict[str, Any] = Field(description="Training configuration")
    parallelism: dict[str, Any] | None = Field(
        default=None,
        description="Parallelism configuration",
    )
    engine: dict[str, Any] | None = Field(default=None, description="Engine configuration")
    hardware: dict[str, Any] | None = Field(default=None, description="Hardware configuration")

    @field_validator("model")
    @classmethod
    def validate_moe_settings(cls, v: dict[str, Any]) -> dict[str, Any]:
        """Validate MoE-specific constraints.

        Only applies when ``moe_enabled`` is truthy; the constraints are
        ``top_k <= num_experts``, ``1 <= num_experts <= 256`` and
        ``1 <= top_k <= 8``.

        Raises:
            ValueError: If any MoE constraint is violated.
        """
        if v.get("moe_enabled"):
            num_experts = v.get("num_experts", 1)
            top_k = v.get("top_k", 1)

            if top_k > num_experts:
                raise ValueError(f"MoE top_k ({top_k}) cannot exceed num_experts ({num_experts})")

            if num_experts < 1 or num_experts > 256:
                raise ValueError(f"num_experts must be between 1 and 256, got {num_experts}")

            if top_k < 1 or top_k > 8:
                raise ValueError(f"top_k must be between 1 and 8, got {top_k}")

        return v

    @model_validator(mode="after")
    def validate_parallelism_consistency(self) -> "CalculateRequest":
        """Validate parallelism settings consistency.

        The product tensor_pp * pipeline_pp * data_pp must equal num_gpus,
        and sequence parallelism requires tensor parallelism to be enabled.

        Raises:
            ValueError: On a GPU-count mismatch or an invalid sequence-parallel
                configuration.
        """
        if self.parallelism and self.hardware:
            tensor_pp = self.parallelism.get("tensor_parallel_size", 1)
            pipeline_pp = self.parallelism.get("pipeline_parallel_size", 1)
            data_pp = self.parallelism.get("data_parallel_size", 1)
            num_gpus = self.hardware.get("num_gpus", 1)

            effective_gpus = tensor_pp * pipeline_pp * data_pp

            if effective_gpus != num_gpus:
                # Fix: the multiplication sign was mojibake ("ร") from a bad
                # encoding round-trip; restored to "×".
                raise ValueError(
                    f"Parallelism mismatch: tensor_pp ({tensor_pp}) × "
                    f"pipeline_pp ({pipeline_pp}) × data_pp ({data_pp}) = "
                    f"{effective_gpus} GPUs, but num_gpus is set to {num_gpus}. "
                    f"These must match."
                )

        # Validate sequence parallel requires tensor parallel > 1
        if self.parallelism and self.parallelism.get("sequence_parallel"):
            tensor_pp = self.parallelism.get("tensor_parallel_size", 1)
            if tensor_pp <= 1:
                raise ValueError(
                    f"Sequence parallelism requires tensor_parallel_size > 1, got {tensor_pp}"
                )

        return self

    @model_validator(mode="after")
    def validate_engine_settings(self) -> "CalculateRequest":
        """Validate engine-specific settings.

        ZeRO stages are a DeepSpeed feature: non-zero stages are rejected for
        other engines, and the stage must be within 0-3.

        Raises:
            ValueError: On an invalid engine/ZeRO combination or stage range.
        """
        if not self.engine:
            return self

        engine_type = self.engine.get("type")
        zero_stage = self.engine.get("zero_stage", 0)

        # ZeRO stages only valid for DeepSpeed engines
        if engine_type not in ["deepspeed", "megatron_deepspeed"] and zero_stage > 0:
            raise ValueError(
                f"ZeRO stages are only supported for DeepSpeed engines, "
                f"got engine_type='{engine_type}' with zero_stage={zero_stage}"
            )

        # Validate ZeRO stage range
        if zero_stage < 0 or zero_stage > 3:
            raise ValueError(f"zero_stage must be between 0 and 3, got {zero_stage}")

        return self
+
+
class PresetInfo(BaseModel):
    """Information about a preset model configuration."""

    name: str  # internal preset key (dict key in PRESETS, used in API paths)
    display_name: str  # human-readable name shown to users
    description: str  # short description shown alongside the preset
    config: dict[str, Any]  # raw config payload served by /api/preset/{name}
+
+
# Simple in-memory cache for calculation results
# In production, use Redis or similar
# Maps cache key (md5 of the canonical request JSON, see _cache_key_from_request)
# to a (result, insertion unix-timestamp) pair.
_calculation_cache: dict[str, tuple[MemoryResult, float]] = {}  # key -> (result, timestamp)
_CACHE_TTL = 3600  # 1 hour
_MAX_CACHE_SIZE = 1000  # max entries; beyond this the oldest insertion is evicted
+
+
+def _cache_key_from_request(request: CalculateRequest) -> str:
+ """Generate cache key from request."""
+ request_dict = request.model_dump()
+ # Sort keys for consistent hashing
+ request_str = json.dumps(request_dict, sort_keys=True)
+ return hashlib.md5(request_str.encode()).hexdigest()
+
+
def _get_cached_result(key: str) -> MemoryResult | None:
    """Return the cached result for *key*, or ``None`` if absent or expired.

    Expired entries are deleted eagerly as a side effect so the cache does
    not accumulate stale data.
    """
    entry = _calculation_cache.get(key)
    if entry is None:
        return None

    import time

    result, stored_at = entry
    if time.time() - stored_at < _CACHE_TTL:
        return result
    # Entry outlived the TTL: drop it and report a miss.
    del _calculation_cache[key]
    return None
+
+
def _cache_result(key: str, result: MemoryResult) -> None:
    """Store *result* under *key*, evicting one entry when the cache is full.

    Eviction removes the first key in iteration order — effectively the
    oldest insertion, since dicts preserve insertion order.
    """
    import time

    if len(_calculation_cache) >= _MAX_CACHE_SIZE:
        oldest = next(iter(_calculation_cache))
        _calculation_cache.pop(oldest)

    _calculation_cache[key] = (result, time.time())
+
+
+# Load presets at startup using shared preset loader
+# The shared loader reads from web/presets/models.json
def _load_presets_from_shared() -> dict[str, PresetInfo]:
    """Build the PresetInfo registry from the shared preset loader.

    Missing metadata falls back to sensible defaults: the preset key for
    ``display_name``, and empty strings/dicts for the rest.
    """
    registry: dict[str, PresetInfo] = {}
    for name, raw in load_presets().items():
        registry[name] = PresetInfo(
            name=name,
            display_name=raw.get("display_name", name),
            description=raw.get("description", ""),
            config=raw.get("config", {}),
        )
    return registry


PRESETS = _load_presets_from_shared()
+
+
+# API Routes
+@app.get("/")
+async def index(request: Request) -> Any:
+ """Serve the main web page."""
+ return templates.TemplateResponse("index.html", {"request": request})
+
+
+@app.get("/api/engines")
+async def list_engines() -> dict[str, str]:
+ """List supported training engines."""
+ return {
+ "pytorch_ddp": "PyTorch DDP (Distributed Data Parallel)",
+ "deepspeed": "DeepSpeed ZeRO",
+ "megatron_lm": "Megatron-LM",
+ "fsdp": "PyTorch FSDP (Fully Sharded Data Parallel)",
+ "megatron_deepspeed": "Megatron-LM + DeepSpeed",
+ }
+
+
+@app.get("/api/optimizers")
+async def list_optimizers() -> dict[str, str]:
+ """List supported optimizers."""
+ return {
+ "adam": "Adam",
+ "adamw": "AdamW",
+ "adamw_8bit": "AdamW 8-bit",
+ "sgd": "SGD",
+ }
+
+
+@app.get("/api/dtypes")
+async def list_dtypes() -> dict[str, str]:
+ """List supported data types."""
+ return {
+ "fp32": "FP32 (32-bit floating point)",
+ "fp16": "FP16 (16-bit floating point)",
+ "bf16": "BF16 (16-bit bfloat)",
+ "int8": "INT8 (8-bit integer)",
+ "int4": "INT4 (4-bit integer)",
+ }
+
+
+@app.get("/api/presets")
+async def list_presets() -> dict[str, dict[str, str]]:
+ """List all preset model configurations."""
+ return {
+ name: {
+ "display_name": preset.display_name,
+ "description": preset.description,
+ }
+ for name, preset in PRESETS.items()
+ }
+
+
+@app.get("/api/preset/{preset_name}")
+async def get_preset(preset_name: str) -> dict[str, Any]:
+ """Get a specific preset configuration."""
+ if preset_name not in PRESETS:
+ raise HTTPException(status_code=404, detail=f"Preset '{preset_name}' not found")
+
+ return PRESETS[preset_name].config
+
+
+@app.post("/api/calculate")
+async def calculate_memory(request: CalculateRequest) -> MemoryResult:
+ """Calculate GPU memory requirements.
+
+ Args:
+ request: Calculation request with model, training, and hardware configs
+
+ Returns:
+ MemoryResult with complete memory breakdown
+ """
+ # Check cache first
+ cache_key = _cache_key_from_request(request)
+ cached_result = _get_cached_result(cache_key)
+ if cached_result is not None:
+ logger.info(f"Cache hit for key: {cache_key[:8]}...")
+ return cached_result
+
+ try:
+ # Parse model configuration
+ model_data = request.model.copy()
+ # Parse num_parameters if it's a string (e.g., "7B", "7000M")
+ if "num_parameters" in model_data and isinstance(
+ model_data["num_parameters"],
+ str,
+ ):
+ from gpu_mem_calculator.config.parser import ConfigParser
+
+ model_data["num_parameters"] = ConfigParser._parse_num_params(
+ model_data["num_parameters"],
+ )
+
+ model_config = ModelConfig(**model_data)
+
+ # Parse training configuration
+ training_config = TrainingConfig(**request.training)
+
+ # Parse optional configurations with defaults
+ parallelism_config = (
+ ParallelismConfig(**request.parallelism) if request.parallelism else ParallelismConfig()
+ )
+
+ engine_config = EngineConfig(**request.engine) if request.engine else EngineConfig()
+
+ gpu_config = GPUConfig(**request.hardware) if request.hardware else GPUConfig()
+
+ # Create calculator and compute
+ calculator = GPUMemoryCalculator(
+ model_config=model_config,
+ training_config=training_config,
+ parallelism_config=parallelism_config,
+ engine_config=engine_config,
+ gpu_config=gpu_config,
+ )
+
+ result = calculator.calculate()
+
+ # Cache the result
+ _cache_result(cache_key, result)
+
+ logger.info(
+ f"Calculation successful: {model_config.name}, "
+ f"{result.total_memory_per_gpu_gb:.2f} GB per GPU"
+ )
+
+ return result
+
+ except ValueError as e:
+ # User input validation error
+ logger.warning(f"Validation error: {str(e)}")
+ raise HTTPException(
+ status_code=400,
+ detail={"error": "Validation error", "message": str(e), "type": "validation_error"},
+ ) from e
+ except Exception as e:
+ # Unexpected system error
+ logger.error(f"Calculation error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500,
+ detail={
+ "error": "Internal server error",
+ "message": "An unexpected error occurred during calculation",
+ },
+ ) from e
+
+
+@app.post("/api/export/deepspeed")
+async def export_deepspeed_config(request: CalculateRequest) -> dict[str, Any]:
+ """Export DeepSpeed configuration file.
+
+ Args:
+ request: Calculation request with model, training, and hardware configs
+
+ Returns:
+ DeepSpeed config JSON and memory result
+ """
+ try:
+ # First calculate memory
+ calc_result = await calculate_memory(request)
+
+ # Generate DeepSpeed config
+ parallelism = request.parallelism or {}
+ training = request.training
+ engine = request.engine or {}
+
+ train_batch_size = (
+ training.get("batch_size", 1)
+ * training.get("gradient_accumulation_steps", 1)
+ * parallelism.get("data_parallel_size", 1)
+ )
+
+ zero_stage = engine.get("zero_stage", 0)
+ offload_optimizer = engine.get("offload_optimizer", "none")
+ offload_param = engine.get("offload_param", "none")
+
+ deepspeed_config = {
+ "train_batch_size": train_batch_size,
+ "train_micro_batch_size_per_gpu": training.get("batch_size", 1),
+ "gradient_accumulation_steps": training.get("gradient_accumulation_steps", 1),
+ "optimizer": {
+ "type": training.get("optimizer", "AdamW"),
+ "params": {"lr": 0.0001, "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 0.01},
+ },
+ "scheduler": {
+ "type": "WarmupLR",
+ "params": {"warmup_min_lr": 0, "warmup_max_lr": 0.0001, "warmup_num_steps": 2000},
+ },
+ "fp16": {"enabled": training.get("dtype") in ["fp16", "int4", "int8"]},
+ "bf16": {"enabled": training.get("dtype") == "bf16"},
+ "zero_optimization": {"stage": zero_stage},
+ "gradient_clipping": training.get("gradient_clipping", 1.0),
+ "steps_per_print": 100,
+ }
+
+ # Add offload config if ZeRO stage >= 1
+ if zero_stage >= 1:
+ deepspeed_config["zero_optimization"]["offload_optimizer"] = {
+ "device": offload_optimizer
+ }
+ deepspeed_config["zero_optimization"]["offload_param"] = {"device": offload_param}
+
+ return {"config": deepspeed_config, "memory_result": calc_result}
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"DeepSpeed export error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to generate DeepSpeed config: {str(e)}"
+ ) from e
+
+
+@app.post("/api/optimize/batch-size")
+async def optimize_batch_size(request: CalculateRequest) -> dict[str, Any]:
+ """Find maximum batch size that fits in GPU memory.
+
+ Uses binary search to find the maximum batch size that doesn't OOM.
+
+ Args:
+ request: Calculation request with model, training, and hardware configs
+
+ Returns:
+ Maximum batch size that fits and corresponding memory result
+ """
+ try:
+ # Create a mutable copy for testing
+ from copy import deepcopy
+
+ min_batch = 1
+ max_batch = 512 # Reasonable upper bound
+ best_batch = 1
+
+ while min_batch <= max_batch:
+ mid = (min_batch + max_batch) // 2
+
+ # Create modified request with test batch size
+ test_request = deepcopy(request)
+ test_request.training["batch_size"] = mid
+
+ try:
+ # Validate and calculate
+ CalculateRequest.model_validate(test_request)
+ result = await calculate_memory(test_request)
+
+ if result.fits_on_gpu:
+ best_batch = mid
+ min_batch = mid + 1
+ else:
+ max_batch = mid - 1
+ except (ValueError, HTTPException):
+ # Invalid config or doesn't fit
+ max_batch = mid - 1
+
+ # Get final result for best batch size
+ final_request = deepcopy(request)
+ final_request.training["batch_size"] = best_batch
+ final_result = await calculate_memory(final_request)
+
+ return {"max_batch_size": best_batch, "memory_result": final_result}
+
+ except Exception as e:
+ logger.error(f"Batch size optimization error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to optimize batch size: {str(e)}"
+ ) from e
+
+
+@app.post("/api/validate")
+async def validate_config(request: CalculateRequest) -> dict[str, Any]:
+ """Validate a configuration without calculating memory.
+
+ Args:
+ request: Configuration to validate
+
+ Returns:
+ Validation result with valid flag and any errors
+ """
+ try:
+ # Pydantic validation happens automatically when creating CalculateRequest
+ # If we get here, the request is valid
+ return {"valid": True, "errors": []}
+
+ except ValueError as e:
+ # Validation error
+ return {"valid": False, "errors": [str(e)]}
+ except Exception as e:
+ # Unexpected error
+ logger.error(f"Validation error: {str(e)}", exc_info=True)
+ return {"valid": False, "errors": [str(e)]}
+
+
+@app.post("/api/explain-formula")
+async def explain_formula(request: CalculateRequest) -> dict[str, Any]:
+ """Explain the memory formula used for calculation.
+
+ Returns detailed information about which formula is being used,
+ with the user's values plugged in, and links to documentation.
+
+ Args:
+ request: Calculation request with model, training, and hardware configs
+
+ Returns:
+ Formula explanation with formula type, breakdown, and references
+ """
+ try:
+ # Get configuration details
+ engine_type = request.engine.get("type", "pytorch_ddp") if request.engine else "pytorch_ddp"
+ num_params = request.model.get("num_parameters", 0)
+
+ # Parse num_parameters if it's a string (e.g., "7B", "7000M")
+ if isinstance(num_params, str):
+ from gpu_mem_calculator.config.parser import ConfigParser
+
+ num_params = ConfigParser._parse_num_params(num_params)
+
+ optimizer = request.training.get("optimizer", "adamw")
+ num_gpus = request.hardware.get("num_gpus", 1) if request.hardware else 1
+ batch_size = request.training.get("batch_size", 1)
+
+ # Calculate memory to get the breakdown
+ result = await calculate_memory(request)
+
+ # Determine formula description based on engine type
+ formula_info = {
+ "engine_type": engine_type,
+ "engine_name": _get_engine_name(engine_type),
+ "formula_components": [],
+ "total_memory_gb": round(result.total_memory_per_gpu_gb, 2),
+ "breakdown": {
+ "model_params_gb": round(result.breakdown.model_params_gb, 2),
+ "gradients_gb": round(result.breakdown.gradients_gb, 2),
+ "optimizer_states_gb": round(result.breakdown.optimizer_states_gb, 2),
+ "activations_gb": round(result.breakdown.activations_gb, 2),
+ "overhead_gb": round(result.breakdown.overhead_gb, 2),
+ },
+ "references": _get_formula_references(engine_type),
+ }
+
+ # Add engine-specific formula details
+ if engine_type == "pytorch_ddp":
+ formula_info["formula_description"] = (
+ "PyTorch DDP stores complete copies of model parameters, gradients, "
+ "and optimizer states on each GPU."
+ )
+ formula_info["formula_components"] = [
+ {
+ "name": "Model Parameters",
+ "formula": f"{num_params:,} ร 2 bytes (FP16/BF16)",
+ "result": f"{result.breakdown.model_params_gb:.2f} GB",
+ "description": "Full model stored on each GPU",
+ },
+ {
+ "name": "Gradients",
+ "formula": f"{num_params:,} ร 2 bytes (FP16)",
+ "result": f"{result.breakdown.gradients_gb:.2f} GB",
+ "description": "Full gradients during backward pass",
+ },
+ {
+ "name": "Optimizer States",
+ "formula": _get_optimizer_formula(optimizer, num_params)["formula"],
+ "result": f"{result.breakdown.optimizer_states_gb:.2f} GB",
+ "description": _get_optimizer_formula(optimizer, num_params)["description"],
+ },
+ ]
+
+ elif engine_type in ["deepspeed", "megatron_deepspeed"]:
+ zero_stage = request.engine.get("zero_stage", 0) if request.engine else 0
+ offload_optimizer = (
+ request.engine.get("offload_optimizer", "none") if request.engine else "none"
+ )
+ offload_param = (
+ request.engine.get("offload_param", "none") if request.engine else "none"
+ )
+
+ if zero_stage == 0:
+ stage_name = "ZeRO-0 (Baseline)"
+ formula_info["formula_description"] = (
+ f"{stage_name}: No memory optimization. Same as PyTorch DDP."
+ )
+ elif zero_stage == 1:
+ stage_name = "ZeRO-1"
+ formula_info["formula_description"] = (
+ f"{stage_name}: Shards optimizer states across {num_gpus} GPUs. "
+ f"Reduces optimizer memory by {num_gpus}x."
+ )
+ elif zero_stage == 2:
+ stage_name = "ZeRO-2"
+ formula_info["formula_description"] = (
+ f"{stage_name}: Shards optimizer states AND gradients across {num_gpus} GPUs. "
+ f"Reduces memory by {num_gpus}x for both components."
+ )
+ elif zero_stage == 3:
+ stage_name = "ZeRO-3"
+ formula_info["formula_description"] = (
+ f"{stage_name}: Shards parameters, gradients, AND optimizer states. "
+ f"Only largest layer stored intact. Linear memory reduction with GPU count."
+ )
+
+ formula_info["zero_stage"] = zero_stage
+ formula_info["offload_optimizer"] = offload_optimizer
+ formula_info["offload_param"] = offload_param
+
+ # Add ZeRO-specific components
+ if zero_stage == 3:
+ # Estimate largest layer (approx 10% of params for typical models)
+ largest_params = num_params // 10
+ formula_info["formula_components"] = [
+ {
+ "name": "Largest Layer",
+ "formula": f"{largest_params:,} ร 4 bytes (FP16 params + grads)",
+ "result": f"{result.breakdown.model_params_gb:.2f} GB",
+ "description": "Gathered during compute, largest layer kept intact",
+ },
+ {
+ "name": "Sharded Parameters",
+ "formula": f"({num_params:,} ร 2 bytes) / {num_gpus} GPUs",
+ "result": "Included in model params",
+ "description": "Remaining parameters sharded across GPUs",
+ },
+ {
+ "name": "Sharded Optimizer States",
+ "formula": (
+ (
+ f"({_get_optimizer_formula(optimizer, num_params)['formula']}) "
+ f"/ {num_gpus} GPUs"
+ )
+ if offload_optimizer == "none"
+ else f"Offloaded to {offload_optimizer}"
+ ),
+ "result": f"{result.breakdown.optimizer_states_gb:.2f} GB",
+ "description": (
+ _get_optimizer_formula(optimizer, num_params)["description"]
+ + " (sharded or offloaded)"
+ ),
+ },
+ ]
+ else:
+ # ZeRO-1 or ZeRO-2
+ formula_info["formula_components"] = [
+ {
+ "name": "Model Parameters",
+ "formula": f"{num_params:,} ร 2 bytes (FP16)",
+ "result": f"{result.breakdown.model_params_gb:.2f} GB",
+ "description": "Full model on each GPU",
+ },
+ {
+ "name": "Gradients",
+ "formula": (
+ f"{num_params:,} ร 2 bytes"
+ if zero_stage < 2
+ else f"({num_params:,} ร 2 bytes) / {num_gpus} GPUs"
+ ),
+ "result": f"{result.breakdown.gradients_gb:.2f} GB",
+ "description": (
+ "Sharded across GPUs" if zero_stage >= 2 else "Full gradients"
+ ),
+ },
+ {
+ "name": "Optimizer States",
+ "formula": (
+ (
+ f"({_get_optimizer_formula(optimizer, num_params)['formula']}) "
+ f"/ {num_gpus} GPUs"
+ )
+ if offload_optimizer == "none"
+ else f"Offloaded to {offload_optimizer}"
+ ),
+ "result": f"{result.breakdown.optimizer_states_gb:.2f} GB",
+ "description": (
+ _get_optimizer_formula(optimizer, num_params)["description"]
+ + " (sharded or offloaded)"
+ ),
+ },
+ ]
+
+ elif engine_type == "fsdp":
+ sharding_strategy = (
+ request.engine.get("sharding_strategy", "full_shard")
+ if request.engine
+ else "full_shard"
+ )
+
+ if sharding_strategy == "no_shard":
+ strategy_name = "No Sharding (like DDP)"
+ elif sharding_strategy == "shard_grad_op":
+ strategy_name = "Shard Gradients + Optimizer (like ZeRO-2)"
+ else:
+ strategy_name = "Full Shard (like ZeRO-3)"
+
+ formula_info["sharding_strategy"] = sharding_strategy
+ formula_info["strategy_name"] = strategy_name
+ formula_info["formula_description"] = f"FSDP with {strategy_name.lower()} strategy."
+
+ elif engine_type == "megatron_lm":
+ formula_info["formula_description"] = (
+ "Megatron-LM uses tensor and/or pipeline parallelism to "
+ "split the model across GPUs, reducing memory per GPU."
+ )
+
+ # Add parallelism info
+ if request.parallelism:
+ tp_size = request.parallelism.get("tensor_parallel_size", 1)
+ pp_size = request.parallelism.get("pipeline_parallel_size", 1)
+ formula_info["parallelism"] = {
+ "tensor_parallel_size": tp_size,
+ "pipeline_parallel_size": pp_size,
+ }
+
+ # Add activation memory explanation
+ components: list[dict[str, Any]] = formula_info["formula_components"] # type: ignore[assignment]
+ components.append(
+ {
+ "name": "Activations",
+ "formula": (
+ f"batch_size({batch_size}) ร seq_len ร hidden_size ร "
+ f"layers ร ~16 bytes/token/layer"
+ ),
+ "result": f"{result.breakdown.activations_gb:.2f} GB",
+ "description": "Memory from intermediate activations during forward/backward pass",
+ }
+ )
+
+ return formula_info
+
+ except Exception as e:
+ logger.error(f"Formula explanation error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to generate formula explanation: {str(e)}"
+ ) from e
+
+
+def _get_engine_name(engine_type: str) -> str:
+ """Get human-readable engine name."""
+ names = {
+ "pytorch_ddp": "PyTorch DDP (Distributed Data Parallel)",
+ "deepspeed": "DeepSpeed ZeRO",
+ "megatron_lm": "Megatron-LM",
+ "fsdp": "PyTorch FSDP (Fully Sharded Data Parallel)",
+ "megatron_deepspeed": "Megatron-LM + DeepSpeed",
+ }
+ return names.get(engine_type, engine_type)
+
+
+def _get_optimizer_formula(optimizer: str, num_params: int) -> dict[str, str]:
+ """Get optimizer memory formula based on optimizer type.
+
+ Args:
+ optimizer: Optimizer type (adam, adamw, sgd, adamw_8bit)
+ num_params: Number of model parameters
+
+ Returns:
+ Dictionary with 'formula' and 'description' keys
+ """
+ num_params_formatted = f"{num_params:,}"
+
+ if optimizer in ["adam", "adamw"]:
+ return {
+ "formula": f"{num_params_formatted} ร 12 bytes (Adam/AdamW FP32)",
+ "description": "4 bytes FP32 params + 4 bytes momentum + 4 bytes variance",
+ }
+ elif optimizer == "adamw_8bit":
+ return {
+ "formula": f"{num_params_formatted} ร 2 bytes (AdamW 8-bit)",
+ "description": "8-bit quantized optimizer states (2 bytes per parameter)",
+ }
+ elif optimizer == "sgd":
+ return {
+ "formula": f"{num_params_formatted} ร 4 bytes (SGD)",
+ "description": "4 bytes FP32 params (no momentum for SGD)",
+ }
+ else:
+ # Default to AdamW
+ return {
+ "formula": f"{num_params_formatted} ร 12 bytes (Adam/AdamW FP32)",
+ "description": "4 bytes FP32 params + 4 bytes momentum + 4 bytes variance",
+ }
+
+
+def _get_formula_references(engine_type: str) -> list[dict[str, str]]:
+ """Get authoritative references for the formula."""
+ references = [
+ {
+ "title": "EleutherAI Transformer Math 101",
+ "url": "https://blog.eleuther.ai/transformer-math/",
+ "description": "Comprehensive transformer memory breakdown with formulas",
+ },
+ {
+ "title": "Microsoft Research ZeRO Blog",
+ "url": "https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/",
+ "description": "ZeRO optimization techniques and memory formulas",
+ },
+ ]
+
+ if engine_type in ["deepspeed", "megatron_deepspeed"]:
+ references.append(
+ {
+ "title": "DeepSpeed Memory Documentation",
+ "url": "https://deepspeed.readthedocs.io/en/latest/memory.html",
+ "description": "Official DeepSpeed memory requirements and formulas",
+ }
+ )
+ elif engine_type == "megatron_lm" or engine_type == "megatron_deepspeed":
+ references.append(
+ {
+ "title": "NVIDIA Megatron-LM",
+ "url": "https://github.com/NVIDIA/Megatron-LM",
+ "description": "Megatron-LM tensor and pipeline parallelism",
+ }
+ )
+ elif engine_type == "fsdp":
+ references.append(
+ {
+ "title": "PyTorch FSDP Documentation",
+ "url": "https://pytorch.org/docs/stable/fsdp.html",
+ "description": "PyTorch Fully Sharded Data Parallel documentation",
+ }
+ )
+
+ return references
+
+
+@app.post("/api/inference/calculate")
+async def calculate_inference_memory(request: dict[str, Any]) -> dict[str, Any]:
+ """Calculate GPU memory requirements for inference.
+
+ Args:
+ request: Dictionary with model, inference, and hardware configs
+
+ Returns:
+ Inference memory result with breakdown
+ """
+ try:
+ model_data = request.get("model", {})
+ inference_data = request.get("inference", {})
+ hardware_data = request.get("hardware", {})
+
+ # Parse num_parameters if it's a string
+ if "num_parameters" in model_data and isinstance(model_data["num_parameters"], str):
+ from gpu_mem_calculator.config.parser import ConfigParser
+
+ model_data["num_parameters"] = ConfigParser._parse_num_params(
+ model_data["num_parameters"]
+ )
+
+ # Create model config
+ model_config = ModelConfig(**model_data)
+
+ # Create inference config
+ kv_cache_quantization = inference_data.get("kv_cache_quantization", "none")
+ if isinstance(kv_cache_quantization, str):
+ from gpu_mem_calculator.core.models import KVCacheQuantization
+
+ kv_cache_quantization = KVCacheQuantization(kv_cache_quantization)
+
+ inference_config = InferenceConfig(
+ batch_size=inference_data.get("batch_size", 1),
+ kv_cache_quantization=kv_cache_quantization,
+ use_kv_cache=inference_data.get("use_kv_cache", True),
+ tensor_parallel_size=inference_data.get("tensor_parallel_size", 1),
+ gpu_memory_utilization=inference_data.get("gpu_memory_utilization", 0.9),
+ enable_streaming=inference_data.get("enable_streaming", False),
+ # TGI-specific parameters
+ max_total_tokens=inference_data.get("max_total_tokens"),
+ max_input_tokens=inference_data.get("max_input_tokens"),
+ max_batch_total_tokens=inference_data.get("max_batch_total_tokens"),
+ tgi_quantize=inference_data.get("tgi_quantize", "none"),
+ tgi_dtype=inference_data.get("tgi_dtype", "bfloat16"),
+ sharded=inference_data.get("sharded", False),
+ num_shard=inference_data.get("num_shard"),
+ # vLLM-specific parameters
+ block_size=inference_data.get("block_size"),
+ swap_space_gb=inference_data.get("swap_space_gb", 0.0),
+ enable_prefix_caching=inference_data.get("enable_prefix_caching", False),
+ enforce_eager=inference_data.get("enforce_eager", False),
+ max_num_batched_tokens=inference_data.get("max_num_batched_tokens"),
+ max_num_seqs=inference_data.get("max_num_seqs"),
+ vllm_quantization=inference_data.get("vllm_quantization", "none"),
+ # TensorRT-LLM-specific parameters
+ trt_max_batch_size=inference_data.get("trt_max_batch_size"),
+ trt_max_input_len=inference_data.get("trt_max_input_len"),
+ trt_max_seq_len=inference_data.get("trt_max_seq_len"),
+ trt_max_beam_width=inference_data.get("trt_max_beam_width"),
+ # SGLang-specific parameters
+ chunk_size=inference_data.get("chunk_size"),
+ max_running_requests=inference_data.get("max_running_requests"),
+ disable_radix_cache=inference_data.get("disable_radix_cache", False),
+ enable_p2p=inference_data.get("enable_p2p", False),
+ disable_custom_all_reduce=inference_data.get("disable_custom_all_reduce", False),
+ attention_backend=inference_data.get("attention_backend", "flashinfer"),
+ enable_torch_compile=inference_data.get("enable_torch_compile", False),
+ radix_cache_max_seq_len=inference_data.get("radix_cache_max_seq_len"),
+ speculative_algo=inference_data.get("speculative_algo", "default"),
+ multi_lora_enabled=inference_data.get("multi_lora_enabled", False),
+ )
+
+ # Create GPU config
+ gpu_config = GPUConfig(
+ num_gpus=hardware_data.get("num_gpus", 1),
+ gpu_memory_gb=hardware_data.get("gpu_memory_gb", 80),
+ )
+
+ # Get engine type
+ engine_type_str = inference_data.get("engine_type", "huggingface")
+ engine_type_map = {
+ "huggingface": InferenceEngineType.HUGGINGFACE,
+ "vllm": InferenceEngineType.VLLM,
+ "tgi": InferenceEngineType.TGI,
+ "tensorrt_llm": InferenceEngineType.TENSORRT_LLM,
+ "sglang": InferenceEngineType.SGLANG,
+ }
+ engine_type = engine_type_map.get(engine_type_str, InferenceEngineType.HUGGINGFACE)
+
+ # Calculate inference memory
+ calculator = InferenceMemoryCalculator(model_config, inference_config, gpu_config)
+ result = calculator.calculate(engine_type)
+
+ return {
+ "total_memory_per_gpu_gb": result.total_memory_per_gpu_gb,
+ "total_memory_all_gpus_gb": result.total_memory_all_gpus_gb,
+ "breakdown": {
+ "model_params_gb": result.breakdown.model_params_gb,
+ "kv_cache_gb": result.breakdown.kv_cache_gb,
+ "activations_gb": result.breakdown.activations_gb,
+ "overhead_gb": result.breakdown.overhead_gb,
+ },
+ "max_supported_batch_size": result.max_supported_batch_size,
+ "estimated_throughput_tokens_per_sec": result.estimated_throughput_tokens_per_sec,
+ "fits_on_gpu": result.fits_on_gpu,
+ "memory_utilization_percent": result.memory_utilization_percent,
+ }
+
+ except Exception as e:
+ logger.error(f"Inference calculation error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=500, detail=f"Failed to calculate inference memory: {str(e)}"
+ ) from e
+
+
+@app.post("/api/multinode/calculate")
+async def calculate_multinode(request: dict[str, Any]) -> dict[str, Any]:
+    """Calculate network overhead for multi-node training.
+
+    Builds minimal config objects from the loosely-typed request payload
+    (every key is optional; defaults below are substituted for anything
+    missing), runs MultiNodeCalculator, and returns the overhead breakdown
+    together with human-readable optimization suggestions.
+
+    Args:
+        request: Dictionary with model, training, parallelism, engine, and
+            node_config sub-dictionaries.
+
+    Returns:
+        Network overhead result with suggestions
+
+    Raises:
+        HTTPException: 500 if any part of the calculation fails.
+    """
+    try:
+        model_data = request.get("model", {})
+        training_data = request.get("training", {})
+        parallelism_data = request.get("parallelism", {})
+        engine_data = request.get("engine", {})
+        node_data = request.get("node_config", {})
+
+        # Parse num_parameters if it's a string (e.g. "7B" -> 7_000_000_000).
+        # NOTE(review): relies on a private ConfigParser helper — confirm it
+        # is stable API before depending on it further.
+        if "num_parameters" in model_data and isinstance(model_data["num_parameters"], str):
+            from gpu_mem_calculator.config.parser import ConfigParser
+
+            model_data["num_parameters"] = ConfigParser._parse_num_params(
+                model_data["num_parameters"]
+            )
+
+        # Create minimal configs for multi-node calculation.
+        # Only num_parameters comes from the request; layer/hidden/head
+        # counts are fixed placeholder values (7B-class architecture).
+        model_config = ModelConfig(
+            name="multinode-model",
+            num_parameters=model_data.get("num_parameters", 7_000_000_000),
+            num_layers=32,
+            hidden_size=4096,
+            num_attention_heads=32,
+        )
+
+        training_config = TrainingConfig(
+            dtype=training_data.get("dtype", "bf16"),
+            batch_size=training_data.get("batch_size", 4),
+        )
+
+        parallelism_config = ParallelismConfig(
+            tensor_parallel_size=parallelism_data.get("tensor_parallel_size", 1),
+            pipeline_parallel_size=parallelism_data.get("pipeline_parallel_size", 1),
+            sequence_parallel=parallelism_data.get("sequence_parallel", False),
+        )
+
+        engine_config = EngineConfig(
+            type=engine_data.get("type", "deepspeed"),
+            zero_stage=engine_data.get("zero_stage", 3),
+        )
+
+        # Map the request's interconnect string to the enum; unrecognized
+        # values silently fall back to InfiniBand (no 400 is raised).
+        interconnect_type_str = node_data.get("interconnect_type", "infiniband")
+        interconnect_map = {
+            "infiniband": InterconnectType.INFINIBAND,
+            "nvlink": InterconnectType.NVLINK,
+            "ethernet_200g": InterconnectType.ETHERNET_200G,
+            "ethernet_100g": InterconnectType.ETHERNET_100G,
+            "ethernet_25g": InterconnectType.ETHERNET_25G,
+            "ethernet_10g": InterconnectType.ETHERNET_10G,
+        }
+        interconnect_type = interconnect_map.get(interconnect_type_str, InterconnectType.INFINIBAND)
+
+        node_config = NodeConfig(
+            num_nodes=node_data.get("num_nodes", 2),
+            gpus_per_node=node_data.get("gpus_per_node", 8),
+            interconnect_type=interconnect_type,
+        )
+
+        # Calculate network overhead
+        calculator = MultiNodeCalculator(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            node_config=node_config,
+            engine_config=engine_config,
+        )
+
+        overhead = calculator.calculate_network_overhead()
+
+        # Generate optimization suggestions from simple heuristic thresholds
+        # (>10 GB total traffic, >50 ms/step latency, Ethernet beyond 2 nodes).
+        suggestions: list[str] = []
+        if overhead.total_overhead_gb > 10:
+            suggestions.append("Consider reducing tensor parallelism to lower AllGather overhead")
+        if overhead.estimated_overhead_ms_per_step and overhead.estimated_overhead_ms_per_step > 50:
+            overhead_val = overhead.estimated_overhead_ms_per_step
+            suggestions.append(
+                f"High communication overhead ({overhead_val:.1f}ms/step). "
+                "Consider upgrading interconnect or reducing model size."
+            )
+        if interconnect_type_str.startswith("ethernet") and node_config.num_nodes > 2:
+            suggestions.append(
+                "Ethernet interconnect detected. For multi-node training, "
+                "consider InfiniBand for better performance."
+            )
+
+        return {
+            "network_overhead": {
+                "total_overhead_gb": overhead.total_overhead_gb,
+                "allreduce_gb": overhead.allreduce_gb,
+                "allgather_gb": overhead.allgather_gb,
+                "reducescatter_gb": overhead.reducescatter_gb,
+                # point_to_point traffic is exposed under the "pipeline_gb" key.
+                "pipeline_gb": overhead.point_to_point_gb,
+                "estimated_overhead_ms_per_step": overhead.estimated_overhead_ms_per_step,
+                # These two fields are not computed by this endpoint and are
+                # always None — presumably kept for response-schema
+                # compatibility; confirm against API consumers.
+                "communication_time_ms_per_step": None,
+                "latency_overhead_ms": None,
+            },
+            "suggestions": suggestions,
+        }
+
+    except Exception as e:
+        logger.error(f"Multi-node calculation error: {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=500, detail=f"Failed to calculate multi-node overhead: {str(e)}"
+        ) from e
+
+
+@app.post("/api/export/{format}")
+async def export_framework_config(format: str, request: CalculateRequest) -> dict[str, Any]:
+    """Export configuration to framework-specific format.
+
+    Args:
+        format: Export format (accelerate, lightning, axolotl, deepspeed,
+            yaml, json); matched case-insensitively.
+        request: Calculation request with all configurations
+
+    Returns:
+        Exported configuration file content plus a suggested filename.
+
+    Raises:
+        HTTPException: 400 for an unsupported format, 500 on export failure.
+    """
+    try:
+        # Parse configurations. Copy so the request payload is not mutated
+        # when num_parameters is rewritten below.
+        model_data = request.model.copy()
+        if "num_parameters" in model_data and isinstance(model_data["num_parameters"], str):
+            from gpu_mem_calculator.config.parser import ConfigParser
+
+            # Parse human-readable sizes like "7B" into an integer count.
+            model_data["num_parameters"] = ConfigParser._parse_num_params(
+                model_data["num_parameters"]
+            )
+
+        model_config = ModelConfig(**model_data)
+        training_config = TrainingConfig(**request.training)
+        parallelism_config = (
+            ParallelismConfig(**request.parallelism) if request.parallelism else ParallelismConfig()
+        )
+        engine_config = EngineConfig(**request.engine) if request.engine else EngineConfig()
+
+        # Create minimal node config (not used for single-node export)
+        node_config = NodeConfig(num_nodes=1, gpus_per_node=8)
+
+        # Map format string to ExportFormat enum
+        format_map = {
+            "accelerate": ExportFormat.ACCELERATE,
+            "lightning": ExportFormat.LIGHTNING,
+            "axolotl": ExportFormat.AXOLOTL,
+            "deepspeed": ExportFormat.DEEPSPEED,
+            "yaml": ExportFormat.YAML,
+            "json": ExportFormat.JSON,
+        }
+
+        export_format = format_map.get(format.lower())
+        if not export_format:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Unsupported export format: {format}. Supported: {list(format_map.keys())}",
+            )
+
+        # Export configuration
+        manager = ExportManager(
+            model_config=model_config,
+            training_config=training_config,
+            parallelism_config=parallelism_config,
+            engine_config=engine_config,
+            node_config=node_config,
+        )
+
+        result = manager.export(export_format)
+
+        # Generate filename.
+        # NOTE(review): assumes dict results may carry an 'extension' key
+        # (falls back to .txt) — confirm against ExportManager.export.
+        if isinstance(result, dict):
+            filename = f"config_{format}.{result.get('extension', 'txt')}"
+        else:
+            filename = f"config.{format}"
+
+        return {
+            "format": format,
+            "content": result,
+            "filename": filename,
+        }
+
+    except HTTPException:
+        # Re-raise the 400 above unchanged so it is not wrapped as a 500.
+        raise
+    except Exception as e:
+        logger.error(f"Export error ({format}): {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=500, detail=f"Failed to export {format} config: {str(e)}"
+        ) from e
+
+
+def main() -> None:
+ """Run the development server."""
+ import uvicorn
+
+ uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/web/presets/models.json b/web/presets/models.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d56fb187bb3aa73fb5be52f63eceeb88f2995a
--- /dev/null
+++ b/web/presets/models.json
@@ -0,0 +1,407 @@
+{
+ "llama2-7b": {
+ "display_name": "LLaMA 2 7B",
+ "description": "Meta LLaMA 2 7B model",
+ "config": {
+ "model": {
+ "name": "llama2-7b",
+ "num_parameters": "7B",
+ "num_layers": 32,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "vocab_size": 32000,
+ "max_seq_len": 4096
+ },
+ "training": {
+ "batch_size": 4,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 1
+ },
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "llama2-13b": {
+ "display_name": "LLaMA 2 13B",
+ "description": "Meta LLaMA 2 13B model",
+ "config": {
+ "model": {
+ "name": "llama2-13b",
+ "num_parameters": "13B",
+ "num_layers": 40,
+ "hidden_size": 5120,
+ "num_attention_heads": 40,
+ "vocab_size": 32000,
+ "max_seq_len": 4096
+ },
+ "training": {
+ "batch_size": 2,
+ "gradient_accumulation_steps": 8,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 1
+ },
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "llama2-70b": {
+ "display_name": "LLaMA 2 70B",
+ "description": "Meta LLaMA 2 70B model",
+ "config": {
+ "model": {
+ "name": "llama2-70b",
+ "num_parameters": "70B",
+ "num_layers": 80,
+ "hidden_size": 8192,
+ "num_attention_heads": 64,
+ "vocab_size": 32000,
+ "max_seq_len": 4096
+ },
+ "training": {
+ "batch_size": 1,
+ "gradient_accumulation_steps": 16,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 4,
+ "pipeline_parallel_size": 2,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 64,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "gpt3-175b": {
+ "display_name": "GPT-3 175B",
+ "description": "OpenAI GPT-3 175B model",
+ "config": {
+ "model": {
+ "name": "gpt3-175b",
+ "num_parameters": "175B",
+ "num_layers": 96,
+ "hidden_size": 12288,
+ "num_attention_heads": 96,
+ "vocab_size": 50257,
+ "max_seq_len": 2048
+ },
+ "training": {
+ "batch_size": 1,
+ "gradient_accumulation_steps": 1,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 8,
+ "pipeline_parallel_size": 16,
+ "data_parallel_size": 1,
+ "sequence_parallel": true
+ },
+ "engine": {
+ "type": "megatron_lm"
+ },
+ "hardware": {
+ "num_gpus": 1024,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "mixtral-8x7b": {
+ "display_name": "Mixtral 8x7B (MoE)",
+ "description": "Mistral AI Mixtral 8x7B - 46.7B total params, ~12.9B active per token",
+ "config": {
+ "model": {
+ "name": "mixtral-8x7b",
+ "num_parameters": "46.7B",
+ "num_layers": 32,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "vocab_size": 32000,
+ "max_seq_len": 32768,
+ "moe_enabled": true,
+ "num_experts": 8,
+ "top_k": 2,
+ "expert_intermediate_size": 14336
+ },
+ "training": {
+ "batch_size": 2,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 4,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "glm-4-9b": {
+ "display_name": "GLM-4 9B (MoE)",
+ "description": "Tsinghua University GLM-4 9B with MoE architecture",
+ "config": {
+ "model": {
+ "name": "glm-4-9b",
+ "num_parameters": "9B",
+ "num_layers": 40,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "vocab_size": 151552,
+ "max_seq_len": 8192,
+ "moe_enabled": true,
+ "num_experts": 4,
+ "top_k": 2,
+ "expert_intermediate_size": 10240,
+ "shared_expert_intermediate_size": 10240
+ },
+ "training": {
+ "batch_size": 4,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 4,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 2,
+ "offload_optimizer": "none",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 4,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "glm-4.7-355b": {
+ "display_name": "GLM-4.7 355B (MoE)",
+ "description": "Tsinghua University GLM-4.7 - Latest flagship with 355B total / 32B active params",
+ "config": {
+ "model": {
+ "name": "glm-4.7-355b",
+ "num_parameters": "355B",
+ "num_layers": 46,
+ "hidden_size": 4096,
+ "num_attention_heads": 96,
+ "vocab_size": 151552,
+ "max_seq_len": 131072,
+ "moe_enabled": true,
+ "num_experts": 128,
+ "top_k": 8,
+ "expert_intermediate_size": 1408,
+ "shared_expert_intermediate_size": 10944
+ },
+ "training": {
+ "batch_size": 1,
+ "gradient_accumulation_steps": 16,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 4
+ },
+ "parallelism": {
+ "tensor_parallel_size": 8,
+ "pipeline_parallel_size": 4,
+ "data_parallel_size": 16,
+ "sequence_parallel": true
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "cpu"
+ },
+ "hardware": {
+ "num_gpus": 512,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "glm-4.5-air-106b": {
+    "display_name": "GLM-4.5 Air 106B (MoE) ⭐ Air",
+ "description": "Tsinghua University GLM-4.5 Air - 106B total / 12B active params, optimized for deployment",
+ "config": {
+ "model": {
+ "name": "glm-4.5-air-106b",
+ "num_parameters": "106B",
+ "num_layers": 46,
+ "hidden_size": 4096,
+ "num_attention_heads": 96,
+ "vocab_size": 151552,
+ "max_seq_len": 131072,
+ "moe_enabled": true,
+ "num_experts": 128,
+ "top_k": 8,
+ "expert_intermediate_size": 1408,
+ "shared_expert_intermediate_size": 10944
+ },
+ "training": {
+ "batch_size": 2,
+ "gradient_accumulation_steps": 8,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 4,
+ "pipeline_parallel_size": 2,
+ "data_parallel_size": 8,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 64,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "qwen1.5-moe-a2.7b": {
+ "display_name": "Qwen1.5-MoE-A2.7B",
+ "description": "Alibaba Qwen1.5 MoE - 14B total params, 2.7B active per token",
+ "config": {
+ "model": {
+ "name": "qwen1.5-moe-a2.7b",
+ "num_parameters": "14B",
+ "num_layers": 28,
+ "hidden_size": 5120,
+ "num_attention_heads": 40,
+ "vocab_size": 151936,
+ "max_seq_len": 32768,
+ "moe_enabled": true,
+ "num_experts": 8,
+ "top_k": 4,
+ "expert_intermediate_size": 15360
+ },
+ "training": {
+ "batch_size": 2,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 4,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 3,
+ "offload_optimizer": "cpu",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ },
+ "deepseek-moe-16b": {
+ "display_name": "DeepSeek-MoE 16B",
+ "description": "DeepSeek MoE model with 16.4B total params, ~2.7B active per token",
+ "config": {
+ "model": {
+ "name": "deepseek-moe-16b",
+ "num_parameters": "16.4B",
+ "num_layers": 28,
+ "hidden_size": 2048,
+ "num_attention_heads": 16,
+ "vocab_size": 102400,
+ "max_seq_len": 4096,
+ "moe_enabled": true,
+ "num_experts": 64,
+ "top_k": 6,
+ "expert_intermediate_size": 1408,
+ "shared_expert_intermediate_size": 10944
+ },
+ "training": {
+ "batch_size": 4,
+ "gradient_accumulation_steps": 4,
+ "optimizer": "adamw",
+ "dtype": "bf16",
+ "activation_checkpointing": 2
+ },
+ "parallelism": {
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 1,
+ "data_parallel_size": 4,
+ "sequence_parallel": false
+ },
+ "engine": {
+ "type": "deepspeed",
+ "zero_stage": 2,
+ "offload_optimizer": "none",
+ "offload_param": "none"
+ },
+ "hardware": {
+ "num_gpus": 8,
+ "gpu_memory_gb": 80
+ }
+ }
+ }
+}
diff --git a/web/static/css/styles.css b/web/static/css/styles.css
new file mode 100644
index 0000000000000000000000000000000000000000..40fae68f04ffc72a61c0c38647e6c3e67e5d12d8
--- /dev/null
+++ b/web/static/css/styles.css
@@ -0,0 +1,532 @@
+/* GPU Memory Calculator Styles */
+
+/* Design tokens: shared palette, border, and shadow values used throughout. */
+:root {
+    --primary-color: #2563eb;
+    --primary-hover: #1d4ed8;
+    --success-color: #10b981;
+    --warning-color: #f59e0b;
+    --danger-color: #ef4444;
+    --bg-color: #f8fafc;
+    --card-bg: #ffffff;
+    --border-color: #e2e8f0;
+    --text-primary: #1e293b;
+    --text-secondary: #64748b;
+    --shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1);
+}
+
+/* Minimal reset: predictable box model, no default margins/padding. */
+* {
+    box-sizing: border-box;
+    margin: 0;
+    padding: 0;
+}
+
+body {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+    background-color: var(--bg-color);
+    color: var(--text-primary);
+    line-height: 1.6;
+}
+
+.container {
+    max-width: 1400px;
+    margin: 0 auto;
+    padding: 20px;
+}
+
+header {
+    text-align: center;
+    margin-bottom: 30px;
+}
+
+header h1 {
+    font-size: 2.5rem;
+    color: var(--text-primary);
+    margin-bottom: 5px;
+}
+
+.subtitle {
+    color: var(--text-secondary);
+    font-size: 1.1rem;
+}
+
+/* Tab Navigation */
+.tab-navigation {
+    display: flex;
+    gap: 10px;
+    margin-bottom: 30px;
+    justify-content: center;
+    background: var(--card-bg);
+    padding: 10px;
+    border-radius: 8px;
+    box-shadow: var(--shadow);
+}
+
+.tab-btn {
+    padding: 12px 24px;
+    border: 2px solid var(--border-color);
+    background: var(--card-bg);
+    color: var(--text-secondary);
+    border-radius: 6px;
+    cursor: pointer;
+    font-size: 1rem;
+    font-weight: 500;
+    transition: all 0.2s ease;
+    flex: 1;
+    max-width: 200px;
+}
+
+.tab-btn:hover {
+    background: var(--bg-color);
+    border-color: var(--primary-color);
+    color: var(--primary-color);
+}
+
+.tab-btn.active {
+    background: var(--primary-color);
+    color: white;
+    border-color: var(--primary-color);
+}
+
+/* Panes are hidden by default; JS toggles the .active class. */
+.tab-content {
+    display: none;
+}
+
+/* Active pane: two-column grid — form on the left, 400px results column. */
+.tab-content.active {
+    display: grid;
+    grid-template-columns: 1fr 400px;
+    gap: 20px;
+    align-items: start;
+}
+
+/* display: contents lets the wrapper's children participate in the grid. */
+.main-content {
+    display: contents;
+}
+
+/* Stack to a single column on narrow viewports. */
+@media (max-width: 1024px) {
+    .tab-content.active {
+        grid-template-columns: 1fr;
+    }
+}
+
+/* Config Panel */
+.config-panel {
+    background: var(--card-bg);
+    border-radius: 8px;
+    padding: 20px;
+    box-shadow: var(--shadow);
+}
+
+/* Each settings group is separated by a rule line, except the last. */
+.config-section {
+    margin-bottom: 25px;
+    padding-bottom: 20px;
+    border-bottom: 1px solid var(--border-color);
+}
+
+.config-section:last-of-type {
+    border-bottom: none;
+}
+
+.config-section h3 {
+    font-size: 1.2rem;
+    margin-bottom: 15px;
+    color: var(--text-primary);
+}
+
+/* Responsive field grid: as many 200px-min columns as fit. */
+.form-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+    gap: 15px;
+}
+
+/* position: relative anchors the [data-tooltip] pseudo-element popups. */
+.form-group {
+    display: flex;
+    flex-direction: column;
+    position: relative;
+}
+
+.form-group label {
+    font-size: 0.9rem;
+    font-weight: 500;
+    margin-bottom: 5px;
+    color: var(--text-primary);
+}
+
+.form-group input[type="text"],
+.form-group input[type="number"],
+.form-group select {
+    padding: 8px 12px;
+    border: 1px solid var(--border-color);
+    border-radius: 4px;
+    font-size: 0.95rem;
+    transition: border-color 0.2s;
+}
+
+.form-group input[type="text"]:focus,
+.form-group input[type="number"]:focus,
+.form-group select:focus {
+    outline: none;
+    border-color: var(--primary-color);
+}
+
+.form-group input[type="range"] {
+    margin-top: 5px;
+    width: 100%;
+}
+
+.form-group input[type="checkbox"] {
+    margin-right: 8px;
+}
+
+.info-text {
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    margin-top: 10px;
+}
+
+/* Buttons */
+.button-group {
+    display: flex;
+    gap: 10px;
+    margin-top: 20px;
+}
+
+/* Filled call-to-action button. */
+.btn-primary {
+    background-color: var(--primary-color);
+    color: white;
+    border: none;
+    padding: 12px 24px;
+    border-radius: 6px;
+    font-size: 1rem;
+    font-weight: 600;
+    cursor: pointer;
+    transition: background-color 0.2s;
+}
+
+.btn-primary:hover {
+    background-color: var(--primary-hover);
+}
+
+/* Outlined secondary button. */
+.btn-secondary {
+    background-color: white;
+    color: var(--text-primary);
+    border: 1px solid var(--border-color);
+    padding: 12px 24px;
+    border-radius: 6px;
+    font-size: 1rem;
+    font-weight: 500;
+    cursor: pointer;
+    transition: all 0.2s;
+}
+
+.btn-secondary:hover {
+    background-color: var(--bg-color);
+    border-color: var(--text-secondary);
+}
+
+/* Results Panel */
+/* Sticky so results stay visible while the long form scrolls. */
+.results-panel {
+    background: var(--card-bg);
+    border-radius: 8px;
+    padding: 20px;
+    box-shadow: var(--shadow);
+    position: sticky;
+    top: 20px;
+}
+
+.result-card {
+    margin-bottom: 20px;
+    padding-bottom: 15px;
+    border-bottom: 1px solid var(--border-color);
+}
+
+.result-card:last-child {
+    border-bottom: none;
+}
+
+.result-card h3 {
+    font-size: 1.1rem;
+    margin-bottom: 12px;
+    color: var(--text-primary);
+}
+
+/* Label/value rows for headline metrics. */
+.metric {
+    display: flex;
+    justify-content: space-between;
+    margin-bottom: 8px;
+}
+
+.metric-label {
+    color: var(--text-secondary);
+    font-size: 0.95rem;
+}
+
+.metric-value {
+    font-weight: 600;
+    color: var(--text-primary);
+    font-size: 1rem;
+}
+
+/* Smaller rows for the per-component memory breakdown. */
+.breakdown-item {
+    display: flex;
+    justify-content: space-between;
+    margin-bottom: 6px;
+    font-size: 0.9rem;
+}
+
+.breakdown-label {
+    color: var(--text-secondary);
+}
+
+.breakdown-value {
+    font-weight: 500;
+    color: var(--text-primary);
+}
+
+/* Bar Chart */
+/* Horizontal stacked bar; segment widths are set from JS. */
+.bar-chart {
+    display: flex;
+    height: 24px;
+    border-radius: 4px;
+    overflow: hidden;
+    margin-top: 15px;
+    background-color: var(--border-color);
+}
+
+.bar {
+    height: 100%;
+    transition: width 0.3s ease;
+    position: relative;
+}
+
+/* Add patterns to bars for colorblind accessibility: each segment gets a
+   distinct texture (45° stripes, dots, -45° stripes, 90° stripes) so the
+   chart remains readable without relying on hue alone. */
+#bar-params {
+    background-color: #3b82f6;
+    background-image: repeating-linear-gradient(
+        45deg,
+        transparent,
+        transparent 5px,
+        rgba(255, 255, 255, 0.1) 5px,
+        rgba(255, 255, 255, 0.1) 10px
+    );
+}
+
+#bar-grads {
+    background-color: #8b5cf6;
+    /* Dot pattern. Fix: the rule previously declared background-image twice
+       (a stripe gradient, then this dot gradient); the stripe declaration
+       was dead code since the later declaration always wins, so it has
+       been removed. */
+    background-image: radial-gradient(circle, rgba(255,255,255,0.2) 1px, transparent 1px);
+    background-size: 8px 8px;
+}
+
+#bar-optimizer {
+    background-color: #ec4899;
+    background-image: repeating-linear-gradient(
+        -45deg,
+        transparent,
+        transparent 5px,
+        rgba(255, 255, 255, 0.15) 5px,
+        rgba(255, 255, 255, 0.15) 10px
+    );
+}
+
+#bar-activations {
+    background-color: #10b981;
+    background-image: repeating-linear-gradient(
+        90deg,
+        transparent,
+        transparent 5px,
+        rgba(255, 255, 255, 0.1) 5px,
+        rgba(255, 255, 255, 0.1) 10px
+    );
+}
+
+/* Round only the outer corners of the stacked bar. */
+.bar:first-child {
+    border-top-left-radius: 4px;
+    border-bottom-left-radius: 4px;
+}
+
+.bar:last-child {
+    border-top-right-radius: 4px;
+    border-bottom-right-radius: 4px;
+}
+
+.chart-legend {
+    display: flex;
+    gap: 15px;
+    margin-top: 10px;
+    font-size: 0.8rem;
+}
+
+.legend-item {
+    display: flex;
+    align-items: center;
+    gap: 5px;
+}
+
+.legend-color {
+    width: 12px;
+    height: 12px;
+    border-radius: 2px;
+}
+
+/* Legend swatches mirror the bar segment colors. */
+.legend-color.params { background-color: #3b82f6; }
+.legend-color.grads { background-color: #8b5cf6; }
+.legend-color.optimizer { background-color: #ec4899; }
+.legend-color.activations { background-color: #10b981; }
+
+/* Cleanup: removed a second run of #bar-* rules that re-declared the exact
+   same background-color values already set in the pattern rules above —
+   pure duplication with no effect. */
+
+/* Screen reader only class */
+/* Standard visually-hidden technique: content stays in the accessibility
+   tree but occupies no visible space. */
+.sr-only {
+    position: absolute;
+    width: 1px;
+    height: 1px;
+    padding: 0;
+    margin: -1px;
+    overflow: hidden;
+    clip: rect(0, 0, 0, 0);
+    white-space: nowrap;
+    border-width: 0;
+}
+
+/* Status colors */
+.status-success { color: var(--success-color); }
+.status-warning { color: var(--warning-color); }
+.status-danger { color: var(--danger-color); }
+
+/* Group Label */
+.group-label {
+    font-size: 0.95rem;
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 5px;
+    display: block;
+}
+
+/* Error Message */
+/* Fixed toast in the bottom-right corner. */
+.error-message {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    background-color: var(--danger-color);
+    color: white;
+    padding: 15px 20px;
+    border-radius: 6px;
+    box-shadow: var(--shadow);
+    z-index: 1000;
+}
+
+/* Tooltip */
+/* Pure-CSS tooltip: any element with a data-tooltip attribute shows its
+   value in a bubble above itself on hover. */
+[data-tooltip] {
+    position: relative;
+}
+
+[data-tooltip]:hover::after {
+    content: attr(data-tooltip);
+    position: absolute;
+    bottom: 100%;
+    left: 50%;
+    transform: translateX(-50%);
+    padding: 5px 10px;
+    background-color: var(--text-primary);
+    color: white;
+    font-size: 0.8rem;
+    border-radius: 4px;
+    white-space: nowrap;
+    z-index: 100;
+    margin-bottom: 5px;
+}
+
+/* Formula Explanation Section */
+.formula-description {
+    margin-bottom: 15px;
+    line-height: 1.6;
+}
+
+.formula-description p {
+    margin-bottom: 8px;
+}
+
+.formula-components-list {
+    list-style: none;
+    padding: 0;
+    margin: 15px 0;
+}
+
+/* Each formula component is rendered as a bordered card. */
+.formula-components-list li {
+    background-color: var(--bg-color);
+    border: 1px solid var(--border-color);
+    border-radius: 6px;
+    padding: 12px;
+    margin-bottom: 12px;
+}
+
+.component-name {
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 6px;
+    font-size: 1rem;
+}
+
+/* Monospace block for the formula text itself. */
+.component-formula {
+    font-family: 'Courier New', Courier, monospace;
+    background-color: var(--border-color);
+    padding: 8px;
+    border-radius: 4px;
+    margin: 8px 0;
+    font-size: 0.9rem;
+    overflow-x: auto;
+}
+
+.component-calculation {
+    margin: 6px 0;
+    font-size: 0.9rem;
+    color: var(--text-secondary);
+}
+
+.component-result {
+    margin-top: 6px;
+    font-size: 0.95rem;
+    color: var(--primary-color);
+    font-weight: 500;
+}
+
+.formula-references {
+    margin-top: 20px;
+    padding-top: 15px;
+    border-top: 1px solid var(--border-color);
+}
+
+.formula-references h4 {
+    font-size: 1rem;
+    color: var(--text-primary);
+    margin-bottom: 10px;
+}
+
+.formula-references ul {
+    list-style-type: none;
+    padding: 0;
+}
+
+.formula-references li {
+    margin-bottom: 8px;
+}
+
+.formula-references a {
+    color: var(--primary-color);
+    text-decoration: none;
+    font-size: 0.9rem;
+}
+
+.formula-references a:hover {
+    text-decoration: underline;
+}
diff --git a/web/static/js/app.js b/web/static/js/app.js
new file mode 100644
index 0000000000000000000000000000000000000000..eff9e5eaad9bc71a8271daf94079269897b6fc5b
--- /dev/null
+++ b/web/static/js/app.js
@@ -0,0 +1,1404 @@
+// GPU Memory Calculator - Main Application Logic
+
+class GPUMemCalculator {
+    /**
+     * Set up calculator state and wire all event listeners.
+     * Assumes the DOM is already loaded when instantiated.
+     */
+    constructor() {
+        this.apiBase = '/api';             // prefix for all backend endpoints
+        this.autoCalculateEnabled = true;  // master switch for auto-recalculation
+        this.debounceTimer = null;         // pending setTimeout handle, if any
+        this.debounceDelay = 1000; // ms - increased from 500 to reduce API calls
+        this.isApplyingConfig = false; // Flag to prevent auto-calc during preset loads
+        this.lastCalculationTime = 0; // Prevent too frequent calculations
+        this.minCalculationInterval = 500; // Minimum time between calculations (ms)
+        this.savedConfigs = []; // For comparison mode
+        this.initEventListeners();
+        this.initAutoCalculate();
+        this.initTabListeners();
+        this.loadSavedConfigs();
+    }
+
+    /**
+     * Wire up all static UI controls: tab buttons, preset dropdown,
+     * slider/number sync, dynamic field toggles, and action buttons.
+     * Elements looked up here are assumed to exist in the page.
+     */
+    initEventListeners() {
+        // Tab navigation
+        document.querySelectorAll('.tab-btn').forEach(btn => {
+            btn.addEventListener('click', (e) => {
+                const tabName = e.target.dataset.tab;
+                this.switchTab(tabName);
+            });
+        });
+
+        // Preset selection ("custom" means: leave the form as-is)
+        document.getElementById('preset-select').addEventListener('change', (e) => {
+            if (e.target.value !== 'custom') {
+                this.loadPreset(e.target.value);
+            }
+        });
+
+        // Batch size slider sync: keep number input and slider in lockstep.
+        const batchSizeInput = document.getElementById('batch-size');
+        const batchSizeSlider = document.getElementById('batch-size-slider');
+
+        batchSizeSlider.addEventListener('input', (e) => {
+            batchSizeInput.value = e.target.value;
+        });
+
+        batchSizeInput.addEventListener('input', (e) => {
+            batchSizeSlider.value = e.target.value;
+        });
+
+        // GPU memory dropdown: show the free-form field only for "custom";
+        // otherwise copy the selected capacity into the hidden input.
+        document.getElementById('gpu-model').addEventListener('change', (e) => {
+            const customInput = document.getElementById('gpu-mem-custom');
+            if (e.target.value === 'custom') {
+                customInput.style.display = 'block';
+            } else {
+                customInput.style.display = 'none';
+                customInput.value = e.target.value;
+            }
+        });
+
+        // Engine type change - update dynamic fields
+        document.getElementById('engine-type').addEventListener('change', (e) => {
+            this.updateEngineFields(e.target.value);
+        });
+
+        // Parallelism change - update effective GPUs
+        const parallelismInputs = ['tensor-pp', 'pipeline-pp', 'data-pp'];
+        parallelismInputs.forEach(id => {
+            document.getElementById(id).addEventListener('input', () => {
+                this.updateEffectiveGPUs();
+            });
+        });
+
+        // MoE checkbox - toggle visibility of MoE fields
+        document.getElementById('moe-enabled').addEventListener('change', (e) => {
+            this.toggleMoEFields(e.target.checked);
+        });
+
+        // MoE field changes - update display
+        ['num-experts', 'top-k'].forEach(id => {
+            document.getElementById(id).addEventListener('input', () => {
+                this.updateMoEDisplay();
+            });
+        });
+
+        // Calculate button
+        document.getElementById('calculate-btn').addEventListener('click', () => {
+            this.calculateMemory();
+        });
+
+        // Reset button
+        document.getElementById('reset-btn').addEventListener('click', () => {
+            this.resetForm();
+        });
+
+        // Save config button
+        document.getElementById('save-config-btn').addEventListener('click', () => {
+            this.saveConfig();
+        });
+
+        // Copy JSON button
+        document.getElementById('copy-json-btn').addEventListener('click', () => {
+            this.copyConfigJSON();
+        });
+
+        // Show formula details button - use toggle approach
+        document.getElementById('show-formula-btn').addEventListener('click', () => {
+            this.toggleFormulaExplanation();
+        });
+
+        // Initialize dependent UI to match the default engine selection.
+        this.updateEngineFields('deepspeed');
+        this.updateEffectiveGPUs();
+
+        // Store last config for formula explanation
+        this.lastConfig = null;
+        // Track if formula details are currently visible
+        this.formulaDetailsVisible = false;
+    }
+
+ initAutoCalculate() {
+ // List of all input IDs that should trigger auto-calculation
+ const autoCalcInputs = [
+ // Model settings
+ 'model-name', 'num-params', 'num-layers', 'hidden-size', 'num-heads',
+ 'vocab-size', 'seq-len',
+ // MoE settings
+ 'moe-enabled', 'num-experts', 'top-k', 'expert-intermediate-size', 'shared-expert-size',
+ // Training settings
+ 'batch-size', 'batch-size-slider', 'grad-accum', 'optimizer', 'dtype',
+ 'activation-checkpointing',
+ // Parallelism
+ 'tensor-pp', 'pipeline-pp', 'data-pp', 'seq-parallel',
+ // Engine settings
+ 'engine-type', 'zero-stage', 'offload-optimizer', 'offload-param',
+ 'zero-init', 'sharding-strategy', 'use-distributed-optimizer',
+ 'num-micro-batches', 'gradient-clipping', 'weight-decay', 'lr', 'warmup-steps',
+ // Hardware
+ 'num-gpus', 'gpu-model', 'gpu-mem-custom',
+ ];
+
+ // Add event listeners to all inputs
+ autoCalcInputs.forEach(id => {
+ const element = document.getElementById(id);
+ if (!element) return;
+
+ // Use 'change' event for selects and checkboxes
+ // Use 'input' event for text/number inputs
+ const eventType = (element.tagName === 'SELECT' ||
+ element.tagName === 'INPUT' &&
+ (element.type === 'checkbox' || element.type === 'range'))
+ ? 'input' : 'input';
+
+ element.addEventListener(eventType, () => {
+ this.scheduleAutoCalculate();
+ });
+ });
+ }
+
+    /**
+     * Debounced trigger for calculateMemory(): restarts a timer on every
+     * call and only fires once input has been idle for debounceDelay ms.
+     * Bails out entirely during preset loads, when auto-calc is disabled,
+     * or when called too soon after the last calculation.
+     */
+    scheduleAutoCalculate() {
+        // Don't auto-calculate if currently applying a config (preset load)
+        if (this.isApplyingConfig) return;
+
+        // Don't auto-calculate if disabled
+        if (!this.autoCalculateEnabled) return;
+
+        // Check minimum time between calculations
+        // NOTE(review): returning here (instead of still scheduling) means
+        // an edit made within minCalculationInterval of the last calculation
+        // may never trigger a recalculation — confirm this throttle is
+        // intentional rather than deferring via the debounce timer.
+        const now = Date.now();
+        if (now - this.lastCalculationTime < this.minCalculationInterval) {
+            return; // Skip this calculation, too soon
+        }
+
+        // Clear existing timer so rapid edits collapse into one calculation.
+        if (this.debounceTimer) {
+            clearTimeout(this.debounceTimer);
+        }
+
+        // Schedule new calculation
+        this.debounceTimer = setTimeout(() => {
+            this.calculateMemory();
+        }, this.debounceDelay);
+    }
+
+ /**
+ * Client-side validation before making API call
+ * Returns {valid: boolean, errors: string[]}
+ */
+ validateForm() {
+ const errors = [];
+
+ // Get form values
+ const tensorPP = parseInt(document.getElementById('tensor-pp').value) || 1;
+ const pipelinePP = parseInt(document.getElementById('pipeline-pp').value) || 1;
+ const dataPP = parseInt(document.getElementById('data-pp').value) || 1;
+ const numGPUs = parseInt(document.getElementById('num-gpus').value) || 1;
+ const seqParallel = document.getElementById('seq-parallel').checked;
+ const engineType = document.getElementById('engine-type').value;
+ const zeroStage = parseInt(document.getElementById('zero-stage').value) || 0;
+ const moeEnabled = document.getElementById('moe-enabled').checked;
+ const numExperts = parseInt(document.getElementById('num-experts').value) || 1;
+ const topK = parseInt(document.getElementById('top-k').value) || 1;
+
+ // Validate parallelism consistency
+ const effectiveGPUs = tensorPP * pipelinePP * dataPP;
+ if (effectiveGPUs !== numGPUs) {
+ errors.push(
+ `Parallelism mismatch: ${tensorPP}ร${pipelinePP}ร${dataPP}=${effectiveGPUs} GPUs, ` +
+ `but num_gpus=${numGPUs}. These must match.`
+ );
+ }
+
+ // Validate sequence parallel requires tensor parallel > 1
+ if (seqParallel && tensorPP <= 1) {
+ errors.push(
+ 'Sequence parallelism requires tensor_parallel_size > 1, ' +
+ `but tensor_pp=${tensorPP}.`
+ );
+ }
+
+ // Validate ZeRO stages only for DeepSpeed engines
+ if (zeroStage > 0 && !['deepspeed', 'megatron_deepspeed'].includes(engineType)) {
+ errors.push(
+ `ZeRO stages are only supported for DeepSpeed engines, ` +
+ `but engine_type='${engineType}' with zero_stage=${zeroStage}.`
+ );
+ }
+
+ // Validate MoE settings
+ if (moeEnabled) {
+ if (topK > numExperts) {
+ errors.push(
+ `MoE top_k (${topK}) cannot exceed num_experts (${numExperts}).`
+ );
+ }
+ if (numExperts < 1 || numExperts > 256) {
+ errors.push(`num_experts must be between 1 and 256, got ${numExperts}.`);
+ }
+ if (topK < 1 || topK > 8) {
+ errors.push(`top_k must be between 1 and 8, got ${topK}.`);
+ }
+ }
+
+ return {
+ valid: errors.length === 0,
+ errors: errors
+ };
+ }
+
+ /**
+ * Switch between tabs
+ */
+ switchTab(tabName) {
+ // Update tab buttons
+ document.querySelectorAll('.tab-btn').forEach(btn => {
+ btn.classList.remove('active');
+ if (btn.dataset.tab === tabName) {
+ btn.classList.add('active');
+ }
+ });
+
+ // Update tab content
+ document.querySelectorAll('.tab-content').forEach(content => {
+ content.classList.remove('active');
+ content.style.display = 'none';
+ });
+
+ const activeTab = document.getElementById(`${tabName}-tab`);
+ if (activeTab) {
+ activeTab.classList.add('active');
+ activeTab.style.display = 'block';
+ }
+ }
+
  /**
   * Wire up event listeners for the inference and multi-node tabs.
   *
   * Every lookup is null-guarded so the method is safe to call even when a
   * tab's markup is absent from the page. Side effects only; returns nothing.
   */
  initTabListeners() {
    // --- Inference tab: calculate / reset buttons and preset dropdown ---
    const infCalcBtn = document.getElementById('inference-calculate-btn');
    const infResetBtn = document.getElementById('inference-reset-btn');
    const infPresetSelect = document.getElementById('inference-preset-select');
    if (infCalcBtn) {
      infCalcBtn.addEventListener('click', () => this.calculateInferenceMemory());
    }
    if (infResetBtn) {
      infResetBtn.addEventListener('click', () => this.resetInferenceForm());
    }
    if (infPresetSelect) {
      infPresetSelect.addEventListener('change', (e) => {
        // 'custom' means "no preset" — leave the form untouched.
        if (e.target.value !== 'custom') {
          this.loadInferencePreset(e.target.value);
        }
      });
    }

    // GPU memory utilization slider: mirror the slider value into its label.
    const gpuMemUtilSlider = document.getElementById('gpu-memory-util');
    const gpuMemUtilValue = document.getElementById('gpu-memory-util-value');
    if (gpuMemUtilSlider && gpuMemUtilValue) {
      gpuMemUtilSlider.addEventListener('input', (e) => {
        gpuMemUtilValue.textContent = parseFloat(e.target.value).toFixed(2);
      });
    }

    // Inference engine dropdown — show/hide engine-specific sections.
    const infEngineSelect = document.getElementById('inference-engine');
    if (infEngineSelect) {
      infEngineSelect.addEventListener('change', (e) => {
        this.updateInferenceEngineFields(e.target.value);
      });
      // Initialize visibility for the default engine selection.
      this.updateInferenceEngineFields(infEngineSelect.value);
    }

    // --- Multi-node tab: calculate / reset buttons and preset dropdown ---
    const multiCalcBtn = document.getElementById('multinode-calculate-btn');
    const multiResetBtn = document.getElementById('multinode-reset-btn');
    const multiPresetSelect = document.getElementById('multinode-preset-select');
    if (multiCalcBtn) {
      multiCalcBtn.addEventListener('click', () => this.calculateMultiNode());
    }
    if (multiResetBtn) {
      multiResetBtn.addEventListener('click', () => this.resetMultiNodeForm());
    }
    if (multiPresetSelect) {
      multiPresetSelect.addEventListener('change', (e) => {
        if (e.target.value !== 'custom') {
          this.loadMultiNodePreset(e.target.value);
        }
      });
    }

    // Keep the "total GPUs" readout in sync with nodes × GPUs-per-node.
    const numNodesInput = document.getElementById('num-nodes');
    const gpusPerNodeInput = document.getElementById('gpus-per-node');
    const totalGpusSpan = document.getElementById('multinode-total-gpus');

    const updateTotalGpus = () => {
      if (numNodesInput && gpusPerNodeInput && totalGpusSpan) {
        const nodes = parseInt(numNodesInput.value) || 1;
        const gpusPerNode = parseInt(gpusPerNodeInput.value) || 8;
        totalGpusSpan.textContent = nodes * gpusPerNode;
      }
    };

    if (numNodesInput) numNodesInput.addEventListener('input', updateTotalGpus);
    if (gpusPerNodeInput) gpusPerNodeInput.addEventListener('input', updateTotalGpus);

    // Export framework button opens the export modal.
    const exportBtn = document.getElementById('export-framework-btn');
    if (exportBtn) {
      exportBtn.addEventListener('click', () => this.showExportModal());
    }
  }
+
+ /**
+ * Load saved configs from localStorage
+ */
+ loadSavedConfigs() {
+ try {
+ const saved = localStorage.getItem('gpu-mem-saved-configs');
+ if (saved) {
+ this.savedConfigs = JSON.parse(saved);
+ }
+ } catch (e) {
+ console.warn('Failed to load saved configs:', e);
+ this.savedConfigs = [];
+ }
+ }
+
+ /**
+ * Save current config for comparison
+ */
+ saveConfigForComparison() {
+ const config = this.collectFormData();
+ const name = config.model.name || 'unnamed';
+
+ // Add timestamp
+ config.savedAt = new Date().toISOString();
+ config.id = Date.now();
+
+ this.savedConfigs.push(config);
+
+ // Limit to 10 saved configs
+ if (this.savedConfigs.length > 10) {
+ this.savedConfigs.shift();
+ }
+
+ // Save to localStorage
+ try {
+ localStorage.setItem('gpu-mem-saved-configs', JSON.stringify(this.savedConfigs));
+ this.showError(`Saved config: ${name}`, true);
+ } catch (e) {
+ this.showError('Failed to save config');
+ }
+ }
+
+ /**
+ * Show comparison modal/panel
+ */
+ showComparison(configId) {
+ const config = this.savedConfigs.find(c => c.id === configId);
+ if (!config) return;
+
+ const currentConfig = this.collectFormData();
+
+ // Create comparison HTML
+ const comparisonHTML = this.generateComparisonHTML(currentConfig, config);
+
+ // Show in modal (you'll need to add modal HTML to index.html)
+ alert('Comparison feature - modal will be added');
+ }
+
+ /**
+ * Generate HTML for comparison view
+ */
+ generateComparisonHTML(config1, config2) {
+ // Calculate memory for both configs
+ // For now, just return placeholder
+ return `
+ Configuration Comparison
+
+
+
Current Config
+
${JSON.stringify(config1, null, 2)}
+
+
+
Saved Config
+
${JSON.stringify(config2, null, 2)}
+
+
+ `;
+ }
+
  /**
   * Enable or disable automatic recalculation on form changes.
   * @param {boolean} enabled - truthy to auto-run calculations on input.
   */
  setAutoCalculate(enabled) {
    this.autoCalculateEnabled = enabled;
  }
+
+ async loadPreset(presetName) {
+ try {
+ const response = await fetch(`${this.apiBase}/preset/${presetName}`);
+ if (!response.ok) {
+ throw new Error(`Failed to load preset: ${presetName}`);
+ }
+
+ const config = await response.json();
+ this.applyConfig(config);
+ } catch (error) {
+ this.showError(`Failed to load preset: ${error.message}`);
+ }
+ }
+
+ async loadInferencePreset(presetName) {
+ try {
+ const response = await fetch(`${this.apiBase}/preset/${presetName}`);
+ if (!response.ok) {
+ throw new Error(`Failed to load preset: ${presetName}`);
+ }
+
+ const config = await response.json();
+ this.applyInferenceConfig(config);
+ } catch (error) {
+ this.showError(`Failed to load preset: ${error.message}`);
+ }
+ }
+
+ async loadMultiNodePreset(presetName) {
+ try {
+ const response = await fetch(`${this.apiBase}/preset/${presetName}`);
+ if (!response.ok) {
+ throw new Error(`Failed to load preset: ${presetName}`);
+ }
+
+ const config = await response.json();
+ this.applyMultiNodeConfig(config);
+ } catch (error) {
+ this.showError(`Failed to load preset: ${error.message}`);
+ }
+ }
+
+ applyConfig(config) {
+ // Set flag to prevent auto-calculation during config load
+ this.isApplyingConfig = true;
+
+ // Apply model configuration
+ if (config.model) {
+ if (config.model.name) document.getElementById('model-name').value = config.model.name;
+ if (config.model.num_parameters) document.getElementById('num-params').value = config.model.num_parameters;
+ if (config.model.num_layers) document.getElementById('num-layers').value = config.model.num_layers;
+ if (config.model.hidden_size) document.getElementById('hidden-size').value = config.model.hidden_size;
+ if (config.model.num_attention_heads) document.getElementById('num-heads').value = config.model.num_attention_heads;
+ if (config.model.vocab_size) document.getElementById('vocab-size').value = config.model.vocab_size;
+ if (config.model.max_seq_len) document.getElementById('seq-len').value = config.model.max_seq_len;
+ }
+
+ // Apply MoE configuration
+ if (config.model.moe_enabled !== undefined) {
+ document.getElementById('moe-enabled').checked = config.model.moe_enabled;
+ this.toggleMoEFields(config.model.moe_enabled);
+
+ if (config.model.moe_enabled) {
+ if (config.model.num_experts) {
+ document.getElementById('num-experts').value = config.model.num_experts;
+ }
+ if (config.model.top_k) {
+ document.getElementById('top-k').value = config.model.top_k;
+ }
+ if (config.model.expert_intermediate_size) {
+ document.getElementById('expert-intermediate-size').value = config.model.expert_intermediate_size;
+ }
+ if (config.model.shared_expert_intermediate_size) {
+ document.getElementById('shared-expert-size').value = config.model.shared_expert_intermediate_size;
+ }
+ this.updateMoEDisplay();
+ }
+ }
+
+ // Apply training configuration
+ if (config.training) {
+ if (config.training.batch_size) {
+ document.getElementById('batch-size').value = config.training.batch_size;
+ document.getElementById('batch-size-slider').value = config.training.batch_size;
+ }
+ if (config.training.gradient_accumulation_steps) {
+ document.getElementById('grad-accum').value = config.training.gradient_accumulation_steps;
+ }
+ if (config.training.optimizer) document.getElementById('optimizer').value = config.training.optimizer;
+ if (config.training.dtype) document.getElementById('dtype').value = config.training.dtype;
+ if (config.training.activation_checkpointing !== undefined) {
+ document.getElementById('activation-checkpointing').value = config.training.activation_checkpointing;
+ }
+ }
+
+ // Apply parallelism configuration
+ if (config.parallelism) {
+ if (config.parallelism.tensor_parallel_size) {
+ document.getElementById('tensor-pp').value = config.parallelism.tensor_parallel_size;
+ }
+ if (config.parallelism.pipeline_parallel_size) {
+ document.getElementById('pipeline-pp').value = config.parallelism.pipeline_parallel_size;
+ }
+ if (config.parallelism.data_parallel_size) {
+ document.getElementById('data-pp').value = config.parallelism.data_parallel_size;
+ }
+ if (config.parallelism.sequence_parallel) {
+ document.getElementById('seq-parallel').checked = config.parallelism.sequence_parallel;
+ }
+ }
+
+ // Apply engine configuration
+ if (config.engine) {
+ if (config.engine.type) {
+ document.getElementById('engine-type').value = config.engine.type;
+ this.updateEngineFields(config.engine.type);
+ }
+ if (config.engine.zero_stage !== undefined) {
+ document.getElementById('zero-stage').value = config.engine.zero_stage;
+ }
+ if (config.engine.offload_optimizer) {
+ document.getElementById('offload-optimizer').value = config.engine.offload_optimizer;
+ }
+ if (config.engine.offload_param) {
+ document.getElementById('offload-param').value = config.engine.offload_param;
+ }
+ }
+
+ // Apply hardware configuration
+ if (config.hardware) {
+ if (config.hardware.num_gpus) document.getElementById('num-gpus').value = config.hardware.num_gpus;
+ if (config.hardware.gpu_memory_gb) {
+ document.getElementById('gpu-model').value = config.hardware.gpu_memory_gb;
+ document.getElementById('gpu-mem-custom').value = config.hardware.gpu_memory_gb;
+ }
+ }
+
+ this.updateEffectiveGPUs();
+
+ // Re-enable auto-calculation and trigger calculation
+ setTimeout(() => {
+ this.isApplyingConfig = false;
+ this.calculateMemory();
+ }, 100);
+ }
+
+ updateEngineFields(engineType) {
+ const zeroStageGroup = document.getElementById('zero-stage-group');
+ const offloadOptGroup = document.getElementById('offload-opt-group');
+ const offloadParamGroup = document.getElementById('offload-param-group');
+ const zeroInitGroup = document.getElementById('zero-init-group');
+ const shardingStrategyGroup = document.getElementById('sharding-strategy-group');
+ const megatronOptions = document.getElementById('megatron-options');
+
+ // Hide all first
+ zeroStageGroup.style.display = 'none';
+ offloadOptGroup.style.display = 'none';
+ offloadParamGroup.style.display = 'none';
+ zeroInitGroup.style.display = 'none';
+ shardingStrategyGroup.style.display = 'none';
+ megatronOptions.style.display = 'none';
+
+ // Show/hide fields based on engine type
+ switch (engineType) {
+ case 'deepspeed':
+ case 'megatron_deepspeed':
+ zeroStageGroup.style.display = 'block';
+ offloadOptGroup.style.display = 'block';
+ offloadParamGroup.style.display = 'block';
+ zeroInitGroup.style.display = 'block';
+ break;
+ case 'pytorch_ddp':
+ case 'megatron_lm':
+ // No special options
+ break;
+ case 'fsdp':
+ shardingStrategyGroup.style.display = 'block';
+ break;
+ }
+
+ // Show Megatron options for Megatron engines
+ if (engineType === 'megatron_lm' || engineType === 'megatron_deepspeed') {
+ megatronOptions.style.display = 'block';
+ }
+ }
+
+ updateEffectiveGPUs() {
+ const tensorPP = parseInt(document.getElementById('tensor-pp').value) || 1;
+ const pipelinePP = parseInt(document.getElementById('pipeline-pp').value) || 1;
+ const dataPP = parseInt(document.getElementById('data-pp').value) || 1;
+
+ const effectiveGPUs = tensorPP * pipelinePP * dataPP;
+ document.getElementById('effective-gpus').textContent = effectiveGPUs;
+ }
+
+ toggleMoEFields(enabled) {
+ const moeFields = document.getElementById('moe-fields');
+ moeFields.style.display = enabled ? 'block' : 'none';
+ if (enabled) {
+ this.updateMoEDisplay();
+ }
+ }
+
+ updateMoEDisplay() {
+ const numExperts = parseInt(document.getElementById('num-experts').value) || 8;
+ const topK = parseInt(document.getElementById('top-k').value) || 2;
+
+ document.getElementById('total-experts-display').textContent = numExperts;
+ document.getElementById('active-experts-display').textContent = topK;
+ }
+
+ updateInferenceEngineFields(engineType) {
+ const tgiSettings = document.getElementById('tgi-settings');
+ const vllmSettings = document.getElementById('vllm-settings');
+ const tensorrtSettings = document.getElementById('tensorrt-settings');
+ const sglangSettings = document.getElementById('sglang-settings');
+
+ // Hide all engine-specific sections first
+ if (tgiSettings) tgiSettings.style.display = 'none';
+ if (vllmSettings) vllmSettings.style.display = 'none';
+ if (tensorrtSettings) tensorrtSettings.style.display = 'none';
+ if (sglangSettings) sglangSettings.style.display = 'none';
+
+ // Show relevant section based on engine type
+ switch (engineType) {
+ case 'tgi':
+ if (tgiSettings) tgiSettings.style.display = 'block';
+ break;
+ case 'vllm':
+ if (vllmSettings) vllmSettings.style.display = 'block';
+ break;
+ case 'tensorrt_llm':
+ if (tensorrtSettings) tensorrtSettings.style.display = 'block';
+ break;
+ case 'sglang':
+ if (sglangSettings) sglangSettings.style.display = 'block';
+ break;
+ case 'huggingface':
+ default:
+ // No additional settings for HuggingFace
+ break;
+ }
+ }
+
  /**
   * Gather the entire training form into the request payload shape the
   * /calculate API expects: {model, training, parallelism, engine, hardware}.
   *
   * @returns {object} the config object; numeric fields are parsed from the
   *   inputs, MoE fields collapse to 1/null when MoE is disabled or blank.
   */
  collectFormData() {
    // GPU memory: 'custom' selection defers to the free-form input.
    let gpuMem = document.getElementById('gpu-model').value;
    if (gpuMem === 'custom') {
      gpuMem = parseFloat(document.getElementById('gpu-mem-custom').value);
    } else {
      gpuMem = parseFloat(gpuMem);
    }

    // Engine type drives which engine-specific fields below are relevant.
    const engineType = document.getElementById('engine-type').value;

    // MoE parameters (optional size fields may be blank).
    const moeEnabled = document.getElementById('moe-enabled').checked;
    const expertIntermediateSize = document.getElementById('expert-intermediate-size').value;
    const sharedExpertSize = document.getElementById('shared-expert-size').value;

    return {
      model: {
        name: document.getElementById('model-name').value,
        num_parameters: document.getElementById('num-params').value,
        num_layers: parseInt(document.getElementById('num-layers').value),
        hidden_size: parseInt(document.getElementById('hidden-size').value),
        num_attention_heads: parseInt(document.getElementById('num-heads').value),
        vocab_size: parseInt(document.getElementById('vocab-size').value),
        max_seq_len: parseInt(document.getElementById('seq-len').value),
        moe_enabled: moeEnabled,
        // With MoE off, expert counts collapse to 1 (dense model).
        num_experts: moeEnabled ? parseInt(document.getElementById('num-experts').value) : 1,
        top_k: moeEnabled ? parseInt(document.getElementById('top-k').value) : 1,
        expert_intermediate_size: expertIntermediateSize ? parseInt(expertIntermediateSize) : null,
        shared_expert_intermediate_size: sharedExpertSize ? parseInt(sharedExpertSize) : null,
      },
      training: {
        batch_size: parseInt(document.getElementById('batch-size').value),
        gradient_accumulation_steps: parseInt(document.getElementById('grad-accum').value),
        optimizer: document.getElementById('optimizer').value,
        dtype: document.getElementById('dtype').value,
        activation_checkpointing: parseInt(document.getElementById('activation-checkpointing').value),
      },
      parallelism: {
        tensor_parallel_size: parseInt(document.getElementById('tensor-pp').value),
        pipeline_parallel_size: parseInt(document.getElementById('pipeline-pp').value),
        data_parallel_size: parseInt(document.getElementById('data-pp').value),
        sequence_parallel: document.getElementById('seq-parallel').checked,
      },
      engine: {
        type: engineType,
        zero_stage: parseInt(document.getElementById('zero-stage').value),
        offload_optimizer: document.getElementById('offload-optimizer').value,
        offload_param: document.getElementById('offload-param').value,
        zero_init: document.getElementById('zero-init').checked,
        // Optional-chained: these controls only exist for some engines.
        sharding_strategy: document.getElementById('sharding-strategy')?.value || null,
        use_distributed_optimizer: document.getElementById('use-distributed-optimizer')?.checked || false,
        num_micro_batches: parseInt(document.getElementById('num-micro-batches')?.value || 1),
      },
      hardware: {
        num_gpus: parseInt(document.getElementById('num-gpus').value),
        gpu_memory_gb: gpuMem,
      },
    };
  }
+
+ async calculateMemory() {
+ // Client-side validation first
+ const validation = this.validateForm();
+ if (!validation.valid) {
+ // Show validation errors inline
+ this.showError(`Validation error: ${validation.errors[0]}`);
+ return;
+ }
+
+ const config = this.collectFormData();
+ this.lastConfig = config; // Store for formula explanation
+ const calculateBtn = document.getElementById('calculate-btn');
+
+ // Update last calculation time
+ this.lastCalculationTime = Date.now();
+
+ // Show loading state
+ calculateBtn.disabled = true;
+ calculateBtn.textContent = 'Calculating...';
+
+ try {
+ const response = await fetch(`${this.apiBase}/calculate`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify(config),
+ });
+
+ if (!response.ok) {
+ const error = await response.json();
+ const errorMsg = error.detail?.message || error.detail || 'Calculation failed';
+ throw new Error(errorMsg);
+ }
+
+ const result = await response.json();
+ this.displayResults(result);
+ } catch (error) {
+ this.showError(`Calculation failed: ${error.message}`);
+ } finally {
+ calculateBtn.disabled = false;
+ calculateBtn.textContent = 'Calculate';
+ }
+ }
+
+ displayResults(result) {
+ // Main memory results
+ document.getElementById('result-per-gpu').textContent = `${result.total_memory_per_gpu_gb.toFixed(2)} GB`;
+ document.getElementById('result-total').textContent = `${result.total_memory_all_gpus_gb.toFixed(2)} GB`;
+ document.getElementById('result-cpu').textContent = `${result.cpu_memory_gb.toFixed(2)} GB`;
+
+ // Breakdown
+ document.getElementById('breakdown-params').textContent = `${result.breakdown.model_params_gb.toFixed(2)} GB`;
+ document.getElementById('breakdown-grads').textContent = `${result.breakdown.gradients_gb.toFixed(2)} GB`;
+ document.getElementById('breakdown-optimizer').textContent = `${result.breakdown.optimizer_states_gb.toFixed(2)} GB`;
+ document.getElementById('breakdown-activations').textContent = `${result.breakdown.activations_gb.toFixed(2)} GB`;
+ document.getElementById('breakdown-overhead').textContent = `${result.breakdown.overhead_gb.toFixed(2)} GB`;
+
+ // Update bar chart
+ this.updateBarChart(result.breakdown);
+
+ // Feasibility
+ const statusEl = document.getElementById('feasibility-status');
+ const utilEl = document.getElementById('feasibility-util');
+ const recommendedBatchEl = document.getElementById('recommended-batch-container');
+ const recommendedBatchValue = document.getElementById('recommended-batch');
+
+ utilEl.textContent = `${result.memory_utilization_percent.toFixed(1)}%`;
+
+ if (result.fits_on_gpu) {
+ statusEl.textContent = 'โ Fits on GPU';
+ statusEl.className = 'metric-value status-success';
+ recommendedBatchEl.style.display = 'none';
+ } else {
+ statusEl.textContent = 'โ OOM (Out of Memory)';
+ statusEl.className = 'metric-value status-danger';
+ if (result.recommended_batch_size) {
+ recommendedBatchValue.textContent = result.recommended_batch_size;
+ recommendedBatchEl.style.display = 'flex';
+ }
+ }
+
+ // Color code utilization
+ if (result.memory_utilization_percent < 80) {
+ utilEl.className = 'metric-value status-success';
+ } else if (result.memory_utilization_percent < 95) {
+ utilEl.className = 'metric-value status-warning';
+ } else {
+ utilEl.className = 'metric-value status-danger';
+ }
+ }
+
+ updateBarChart(breakdown) {
+ const total = breakdown.model_params_gb + breakdown.gradients_gb +
+ breakdown.optimizer_states_gb + breakdown.activations_gb;
+
+ const paramsPct = (breakdown.model_params_gb / total) * 100;
+ const gradsPct = (breakdown.gradients_gb / total) * 100;
+ const optimizerPct = (breakdown.optimizer_states_gb / total) * 100;
+ const activationsPct = (breakdown.activations_gb / total) * 100;
+
+ document.getElementById('bar-params').style.width = `${paramsPct}%`;
+ document.getElementById('bar-grads').style.width = `${gradsPct}%`;
+ document.getElementById('bar-optimizer').style.width = `${optimizerPct}%`;
+ document.getElementById('bar-activations').style.width = `${activationsPct}%`;
+ }
+
+ async showFormulaExplanation() {
+ if (!this.lastConfig) {
+ this.showError('Please run a calculation first to see the formula explanation.');
+ return;
+ }
+
+ try {
+ const response = await fetch(`${this.apiBase}/explain-formula`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify(this.lastConfig),
+ });
+
+ if (!response.ok) {
+ throw new Error('Failed to get formula explanation');
+ }
+
+ const formulaInfo = await response.json();
+ this.displayFormulaExplanation(formulaInfo);
+ } catch (error) {
+ this.showError(`Failed to load formula explanation: ${error.message}`);
+ }
+ }
+
+ displayFormulaExplanation(formulaInfo) {
+ // Update formula description
+ const descEl = document.getElementById('formula-description');
+ descEl.innerHTML = `
+ Engine: ${formulaInfo.engine_name}
+ Total Memory: ${formulaInfo.total_memory_gb} GB
+ ${formulaInfo.formula_description || ''}
+ `;
+
+ // Update formula components
+ const componentsEl = document.getElementById('formula-components');
+ componentsEl.style.display = 'block';
+
+ let componentsHTML = 'Formula Components:
';
+ componentsEl.innerHTML = componentsHTML;
+
+ // Update references
+ const refsEl = document.getElementById('references-list');
+ const refsContainer = document.querySelector('.formula-references');
+ refsContainer.style.display = 'block';
+
+ let refsHTML = '';
+ formulaInfo.references.forEach(ref => {
+ refsHTML += `${ref.title}`;
+ });
+ refsEl.innerHTML = refsHTML;
+
+ // Update button text and set visibility flag
+ const btn = document.getElementById('show-formula-btn');
+ btn.textContent = 'Hide Formula Details';
+ this.formulaDetailsVisible = true;
+ }
+
+ hideFormulaExplanation() {
+ document.getElementById('formula-components').style.display = 'none';
+ document.querySelector('.formula-references').style.display = 'none';
+
+ const btn = document.getElementById('show-formula-btn');
+ btn.textContent = 'Show Formula Details';
+ this.formulaDetailsVisible = false;
+ }
+
+ async toggleFormulaExplanation() {
+ if (!this.lastConfig) {
+ this.showError('Please run a calculation first to see the formula explanation.');
+ return;
+ }
+
+ if (this.formulaDetailsVisible) {
+ // Currently visible, hide it
+ this.hideFormulaExplanation();
+ } else {
+ // Currently hidden, show it
+ await this.showFormulaExplanation();
+ }
+ }
+
  /**
   * Restore every training-form control to its default value (a 7B dense
   * model on 8×80GB GPUs with DeepSpeed ZeRO-3) and clear all result
   * readouts back to placeholders.
   */
  resetForm() {
    // Model defaults.
    document.getElementById('preset-select').value = 'custom';
    document.getElementById('model-name').value = 'custom-model';
    document.getElementById('num-params').value = '7B';
    document.getElementById('num-layers').value = '32';
    document.getElementById('hidden-size').value = '4096';
    document.getElementById('num-heads').value = '32';
    document.getElementById('vocab-size').value = '32000';
    document.getElementById('seq-len').value = '4096';

    // MoE defaults (disabled, fields hidden).
    document.getElementById('moe-enabled').checked = false;
    document.getElementById('num-experts').value = '8';
    document.getElementById('top-k').value = '2';
    document.getElementById('expert-intermediate-size').value = '';
    document.getElementById('shared-expert-size').value = '';
    this.toggleMoEFields(false);

    // Training / parallelism / engine / hardware defaults.
    document.getElementById('batch-size').value = '4';
    document.getElementById('batch-size-slider').value = '4';
    document.getElementById('grad-accum').value = '4';
    document.getElementById('optimizer').value = 'adamw';
    document.getElementById('dtype').value = 'bf16';
    document.getElementById('activation-checkpointing').value = '2';
    document.getElementById('tensor-pp').value = '1';
    document.getElementById('pipeline-pp').value = '1';
    document.getElementById('data-pp').value = '8';
    document.getElementById('seq-parallel').checked = false;
    document.getElementById('engine-type').value = 'deepspeed';
    document.getElementById('zero-stage').value = '3';
    document.getElementById('offload-optimizer').value = 'cpu';
    document.getElementById('offload-param').value = 'none';
    document.getElementById('zero-init').checked = true;
    document.getElementById('num-gpus').value = '8';
    document.getElementById('gpu-model').value = '80';

    // Sync dependent UI (engine-specific fields, effective-GPU readout).
    this.updateEngineFields('deepspeed');
    this.updateEffectiveGPUs();

    // Clear all result readouts back to placeholders.
    document.getElementById('result-per-gpu').textContent = '-- GB';
    document.getElementById('result-total').textContent = '-- GB';
    document.getElementById('result-cpu').textContent = '-- GB';
    document.getElementById('breakdown-params').textContent = '-- GB';
    document.getElementById('breakdown-grads').textContent = '-- GB';
    document.getElementById('breakdown-optimizer').textContent = '-- GB';
    document.getElementById('breakdown-activations').textContent = '-- GB';
    document.getElementById('breakdown-overhead').textContent = '-- GB';
    document.getElementById('feasibility-status').textContent = '--';
    document.getElementById('feasibility-util').textContent = '--%';
  }
+
+ saveConfig() {
+ const config = this.collectFormData();
+ const jsonStr = JSON.stringify(config, null, 2);
+ const blob = new Blob([jsonStr], { type: 'application/json' });
+ const url = URL.createObjectURL(blob);
+
+ const a = document.createElement('a');
+ a.href = url;
+ a.download = `gpu-mem-config-${Date.now()}.json`;
+ document.body.appendChild(a);
+ a.click();
+ document.body.removeChild(a);
+ URL.revokeObjectURL(url);
+ }
+
+ async copyConfigJSON() {
+ const config = this.collectFormData();
+ const jsonStr = JSON.stringify(config, null, 2);
+
+ try {
+ await navigator.clipboard.writeText(jsonStr);
+ this.showError('Config copied to clipboard!', true);
+ } catch (error) {
+ // Fallback for older browsers
+ const textarea = document.createElement('textarea');
+ textarea.value = jsonStr;
+ document.body.appendChild(textarea);
+ textarea.select();
+ document.execCommand('copy');
+ document.body.removeChild(textarea);
+ this.showError('Config copied to clipboard!', true);
+ }
+ }
+
+ showError(message, isSuccess = false) {
+ const errorEl = document.getElementById('error-message');
+ errorEl.textContent = message;
+ errorEl.style.display = 'block';
+ errorEl.style.backgroundColor = isSuccess ? 'var(--success-color)' : 'var(--danger-color)';
+
+ setTimeout(() => {
+ errorEl.style.display = 'none';
+ }, 3000);
+ }
+
  /**
   * Collect the inference form, POST it to /inference/calculate, and
   * render the response. All engine-specific fields are sent; the API is
   * presumably responsible for ignoring fields irrelevant to the selected
   * engine — TODO confirm.
   */
  async calculateInferenceMemory() {
    try {
      // Helpers: map a blank input to null (string / int / float variants)
      // so optional fields are omitted-as-null in the payload.
      const getValOrNull = (id) => {
        const val = document.getElementById(id).value;
        return val === '' ? null : val;
      };

      const getIntOrNull = (id) => {
        const val = document.getElementById(id).value;
        return val === '' ? null : parseInt(val);
      };

      const getFloatOrNull = (id) => {
        const val = document.getElementById(id).value;
        return val === '' ? null : parseFloat(val);
      };

      const config = {
        model: {
          name: document.getElementById('inference-model-name').value,
          num_parameters: document.getElementById('inference-num-params').value,
          num_layers: parseInt(document.getElementById('inference-num-layers').value),
          hidden_size: parseInt(document.getElementById('inference-hidden-size').value),
          num_attention_heads: parseInt(document.getElementById('inference-num-heads').value),
          vocab_size: parseInt(document.getElementById('inference-vocab-size').value),
          max_seq_len: parseInt(document.getElementById('inference-seq-len').value),
        },
        inference: {
          // Common serving parameters.
          engine_type: document.getElementById('inference-engine').value,
          batch_size: parseInt(document.getElementById('inference-batch-size').value),
          kv_cache_quantization: document.getElementById('kv-cache-quantization').value,
          tensor_parallel_size: parseInt(document.getElementById('tensor-parallel-size').value),
          gpu_memory_utilization: parseFloat(document.getElementById('gpu-memory-util').value),
          use_kv_cache: document.getElementById('use-kv-cache').checked,
          // TGI-specific
          max_total_tokens: getIntOrNull('max-total-tokens'),
          max_input_tokens: getIntOrNull('max-input-tokens'),
          max_batch_total_tokens: getIntOrNull('max-batch-total-tokens'),
          tgi_quantize: getValOrNull('tgi-quantize') || 'none',
          tgi_dtype: getValOrNull('tgi-dtype') || 'bfloat16',
          sharded: document.getElementById('sharded').checked,
          num_shard: getIntOrNull('num-shard'),
          // vLLM-specific
          block_size: getIntOrNull('block-size'),
          swap_space_gb: getFloatOrNull('swap-space-gb') || 0.0,
          enable_prefix_caching: document.getElementById('enable-prefix-caching').checked,
          enforce_eager: document.getElementById('enforce-eager').checked,
          max_num_batched_tokens: getIntOrNull('max-num-batched-tokens'),
          max_num_seqs: getIntOrNull('max-num-seqs'),
          vllm_quantization: getValOrNull('vllm-quantization') || 'none',
          // TensorRT-LLM-specific
          trt_max_batch_size: getIntOrNull('trt-max-batch-size'),
          trt_max_input_len: getIntOrNull('trt-max-input-len'),
          trt_max_seq_len: getIntOrNull('trt-max-seq-len'),
          trt_max_beam_width: getIntOrNull('trt-max-beam-width'),
          // SGLang-specific
          chunk_size: getIntOrNull('chunk-size'),
          max_running_requests: getIntOrNull('max-running-requests'),
          disable_radix_cache: document.getElementById('disable-radix-cache').checked,
          enable_p2p: document.getElementById('enable-p2p').checked,
          disable_custom_all_reduce: document.getElementById('disable-custom-all-reduce').checked,
          attention_backend: getValOrNull('attention-backend') || 'flashinfer',
          enable_torch_compile: document.getElementById('enable-torch-compile').checked,
          radix_cache_max_seq_len: getIntOrNull('radix-cache-max-seq-len'),
          speculative_algo: getValOrNull('speculative-algo') || 'default',
          multi_lora_enabled: document.getElementById('multi-lora-enabled').checked,
        },
        hardware: {
          num_gpus: parseInt(document.getElementById('inference-num-gpus').value),
          gpu_memory_gb: parseInt(document.getElementById('inference-gpu-model').value),
        },
      };

      const response = await fetch(`${this.apiBase}/inference/calculate`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(config),
      });

      if (!response.ok) {
        throw new Error('Failed to calculate inference memory');
      }

      const result = await response.json();
      this.displayInferenceResults(result);
    } catch (error) {
      this.showError(`Error: ${error.message}`);
    }
  }
+
+ displayInferenceResults(result) {
+ document.getElementById('inference-result-per-gpu').textContent = `${result.total_memory_per_gpu_gb.toFixed(2)} GB`;
+ document.getElementById('inference-result-total').textContent = `${result.total_memory_all_gpus_gb.toFixed(2)} GB`;
+ document.getElementById('inference-result-params').textContent = `${result.breakdown.model_params_gb.toFixed(2)} GB`;
+ document.getElementById('inference-result-kv-cache').textContent = `${result.breakdown.kv_cache_gb.toFixed(2)} GB`;
+ document.getElementById('inference-result-activations').textContent = `${result.breakdown.activations_gb.toFixed(2)} GB`;
+ document.getElementById('inference-max-batch').textContent = result.max_supported_batch_size || 'N/A';
+ document.getElementById('inference-throughput').textContent = result.estimated_throughput_tokens_per_sec
+ ? `${result.estimated_throughput_tokens_per_sec.toFixed(0)} tokens/sec`
+ : 'N/A';
+ document.getElementById('inference-fits').textContent = result.fits_on_gpu ? 'โ Yes' : 'โ No';
+ document.getElementById('inference-fits').style.color = result.fits_on_gpu ? 'var(--success-color)' : 'var(--danger-color)';
+ document.getElementById('inference-utilization').textContent = `${result.memory_utilization_percent.toFixed(1)}%`;
+ }
+
+ resetInferenceForm() {
+ document.getElementById('inference-preset-select').value = 'custom';
+ document.getElementById('inference-model-name').value = 'custom-model';
+ document.getElementById('inference-num-params').value = '7B';
+ document.getElementById('inference-num-layers').value = '32';
+ document.getElementById('inference-hidden-size').value = '4096';
+ document.getElementById('inference-num-heads').value = '32';
+ document.getElementById('inference-vocab-size').value = '32000';
+ document.getElementById('inference-seq-len').value = '4096';
+ document.getElementById('inference-batch-size').value = '32';
+ document.getElementById('kv-cache-quantization').value = 'none';
+ document.getElementById('tensor-parallel-size').value = '1';
+ document.getElementById('gpu-memory-util').value = '0.9';
+ document.getElementById('gpu-memory-util-value').textContent = '0.90';
+ document.getElementById('inference-num-gpus').value = '1';
+ document.getElementById('inference-gpu-model').value = '80';
+ document.getElementById('use-kv-cache').checked = true;
+
+ // Reset TGI-specific fields
+ document.getElementById('max-total-tokens').value = '4096';
+ document.getElementById('max-input-tokens').value = '2048';
+ document.getElementById('max-batch-total-tokens').value = '8192';
+ document.getElementById('tgi-quantize').value = 'none';
+ document.getElementById('tgi-dtype').value = 'bfloat16';
+ document.getElementById('sharded').checked = false;
+ document.getElementById('num-shard').value = '1';
+
+ // Reset vLLM-specific fields
+ document.getElementById('block-size').value = '';
+ document.getElementById('swap-space-gb').value = '0';
+ document.getElementById('enable-prefix-caching').checked = false;
+ document.getElementById('enforce-eager').checked = false;
+ document.getElementById('max-num-batched-tokens').value = '';
+ document.getElementById('max-num-seqs').value = '';
+ document.getElementById('vllm-quantization').value = 'none';
+
+ // Reset TensorRT-LLM-specific fields
+ document.getElementById('trt-max-batch-size').value = '2048';
+ document.getElementById('trt-max-input-len').value = '1024';
+ document.getElementById('trt-max-seq-len').value = '2048';
+ document.getElementById('trt-max-beam-width').value = '1';
+
+ // Reset SGLang-specific fields
+ document.getElementById('chunk-size').value = '8192';
+ document.getElementById('max-running-requests').value = '128';
+ document.getElementById('radix-cache-max-seq-len').value = '8192';
+ document.getElementById('attention-backend').value = 'flashinfer';
+ document.getElementById('speculative-algo').value = 'default';
+ document.getElementById('disable-radix-cache').checked = false;
+ document.getElementById('enable-p2p').checked = false;
+ document.getElementById('disable-custom-all-reduce').checked = false;
+ document.getElementById('enable-torch-compile').checked = false;
+ document.getElementById('multi-lora-enabled').checked = false;
+
+ // Clear results
+ document.getElementById('inference-result-per-gpu').textContent = '-- GB';
+ document.getElementById('inference-result-total').textContent = '-- GB';
+ document.getElementById('inference-result-params').textContent = '-- GB';
+ document.getElementById('inference-result-kv-cache').textContent = '-- GB';
+ document.getElementById('inference-result-activations').textContent = '-- GB';
+ document.getElementById('inference-max-batch').textContent = '--';
+ document.getElementById('inference-throughput').textContent = '-- tokens/sec';
+ document.getElementById('inference-fits').textContent = '--';
+ document.getElementById('inference-utilization').textContent = '--%';
+
+ // Reset engine-specific sections visibility
+ const engineType = document.getElementById('inference-engine').value;
+ this.updateInferenceEngineFields(engineType);
+ }
+
+ /**
+ * Calculate multi-node network overhead
+ */
+ async calculateMultiNode() {
+ try {
+ const config = {
+ model: {
+ num_parameters: document.getElementById('multinode-num-params').value,
+ },
+ training: {
+ dtype: document.getElementById('multinode-dtype').value,
+ batch_size: parseInt(document.getElementById('multinode-batch-size').value),
+ seq_length: parseInt(document.getElementById('multinode-seq-len').value),
+ },
+ parallelism: {
+ tensor_parallel_size: parseInt(document.getElementById('multinode-tensor-pp').value),
+ pipeline_parallel_size: parseInt(document.getElementById('multinode-pipeline-pp').value),
+ sequence_parallel: document.getElementById('multinode-seq-parallel').checked,
+ },
+ engine: {
+ type: document.getElementById('multinode-engine').value,
+ zero_stage: parseInt(document.getElementById('multinode-zero-stage').value),
+ },
+ node_config: {
+ num_nodes: parseInt(document.getElementById('num-nodes').value),
+ gpus_per_node: parseInt(document.getElementById('gpus-per-node').value),
+ interconnect_type: document.getElementById('interconnect-type').value,
+ },
+ optimize_strategy: document.getElementById('multinode-optimize').checked,
+ };
+
+ const response = await fetch(`${this.apiBase}/multinode/calculate`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(config),
+ });
+
+ if (!response.ok) {
+ throw new Error('Failed to calculate multi-node overhead');
+ }
+
+ const result = await response.json();
+ this.displayMultiNodeResults(result);
+ } catch (error) {
+ this.showError(`Error: ${error.message}`);
+ }
+ }
+
+ /**
+  * Render the /multinode/calculate response into the multi-node results panel.
+  * @param {Object} result - API response; expects a `network_overhead` object
+  *   and an optional `suggestions` string array.
+  */
+ displayMultiNodeResults(result) {
+ const overhead = result.network_overhead;
+ document.getElementById('multinode-overhead-total').textContent = `${overhead.total_overhead_gb.toFixed(2)} GB`;
+ document.getElementById('multinode-overhead-allreduce').textContent = `${overhead.allreduce_gb.toFixed(2)} GB`;
+ document.getElementById('multinode-overhead-allgather').textContent = `${overhead.allgather_gb.toFixed(2)} GB`;
+ // Optional metrics use `?.` + a fallback so absent fields render as a
+ // placeholder instead of "undefined".
+ document.getElementById('multinode-overhead-reducescatter').textContent = `${overhead.reducescatter_gb?.toFixed(2) || '0.00'} GB`;
+ document.getElementById('multinode-overhead-pipeline').textContent = `${overhead.pipeline_gb?.toFixed(2) || '0.00'} GB`;
+ document.getElementById('multinode-time-overhead').textContent = `${overhead.estimated_overhead_ms_per_step?.toFixed(2) || 'N/A'} ms/step`;
+ document.getElementById('multinode-comm-time').textContent = `${overhead.communication_time_ms_per_step?.toFixed(2) || 'N/A'} ms/step`;
+ document.getElementById('multinode-latency').textContent = `${overhead.latency_overhead_ms?.toFixed(2) || 'N/A'} ms`;
+
+ // Display suggestions
+ // NOTE(review): the innerHTML literals below look garbled — the markup
+ // (presumably list tags) appears to have been stripped from this copy of
+ // the file, leaving multi-line template strings with bare text. Confirm
+ // against the original source before editing these literals.
+ const suggestionsDiv = document.getElementById('multinode-suggestions');
+ if (result.suggestions && result.suggestions.length > 0) {
+ suggestionsDiv.innerHTML = '' + result.suggestions.map(s => `- ${s}
`).join('') + '
';
+ } else {
+ suggestionsDiv.innerHTML = 'No optimization suggestions available.
';
+ }
+ }
+
+ /**
+  * Restore every control on the multi-node tab to its default value and put
+  * the results panel back to its placeholder state.
+  */
+ resetMultiNodeForm() {
+ document.getElementById('multinode-preset-select').value = 'custom';
+ document.getElementById('multinode-num-params').value = '7B';
+ document.getElementById('multinode-dtype').value = 'bf16';
+ document.getElementById('num-nodes').value = '2';
+ document.getElementById('gpus-per-node').value = '8';
+ // Derived readout: 2 nodes x 8 GPUs.
+ document.getElementById('multinode-total-gpus').textContent = '16';
+ document.getElementById('interconnect-type').value = 'infiniband';
+ document.getElementById('multinode-engine').value = 'deepspeed';
+ document.getElementById('multinode-zero-stage').value = '3';
+ document.getElementById('multinode-batch-size').value = '4';
+ document.getElementById('multinode-seq-len').value = '4096';
+ document.getElementById('multinode-tensor-pp').value = '1';
+ document.getElementById('multinode-pipeline-pp').value = '1';
+ document.getElementById('multinode-seq-parallel').checked = false;
+ document.getElementById('multinode-optimize').checked = true;
+
+ // Clear results
+ document.getElementById('multinode-overhead-total').textContent = '-- GB';
+ document.getElementById('multinode-overhead-allreduce').textContent = '-- GB';
+ document.getElementById('multinode-overhead-allgather').textContent = '-- GB';
+ document.getElementById('multinode-overhead-reducescatter').textContent = '-- GB';
+ document.getElementById('multinode-overhead-pipeline').textContent = '-- GB';
+ document.getElementById('multinode-time-overhead').textContent = '-- ms/step';
+ document.getElementById('multinode-comm-time').textContent = '-- ms/step';
+ document.getElementById('multinode-latency').textContent = '-- ms';
+ // NOTE(review): this innerHTML literal looks garbled — its surrounding
+ // markup appears stripped from this copy; confirm against the original
+ // source before editing.
+ document.getElementById('multinode-suggestions').innerHTML = 'Run calculation to see optimization suggestions.
';
+ }
+
+ applyInferenceConfig(config) {
+ // Apply model configuration to inference form
+ if (config.model) {
+ if (config.model.name) {
+ document.getElementById('inference-model-name').value = config.model.name;
+ }
+ if (config.model.num_parameters) {
+ document.getElementById('inference-num-params').value = config.model.num_parameters;
+ }
+ if (config.model.num_layers) {
+ document.getElementById('inference-num-layers').value = config.model.num_layers;
+ }
+ if (config.model.hidden_size) {
+ document.getElementById('inference-hidden-size').value = config.model.hidden_size;
+ }
+ if (config.model.num_attention_heads) {
+ document.getElementById('inference-num-heads').value = config.model.num_attention_heads;
+ }
+ if (config.model.vocab_size) {
+ document.getElementById('inference-vocab-size').value = config.model.vocab_size;
+ }
+ if (config.model.max_seq_len) {
+ document.getElementById('inference-seq-len').value = config.model.max_seq_len;
+ }
+ }
+ }
+
+ applyMultiNodeConfig(config) {
+ // Apply model configuration to multinode form
+ if (config.model) {
+ if (config.model.num_parameters) {
+ document.getElementById('multinode-num-params').value = config.model.num_parameters;
+ }
+ }
+ }
+
+ /**
+ * Show export framework modal
+ */
+ showExportModal() {
+ const format = prompt('Select export format:\n1 - Accelerate\n2 - Lightning\n3 - Axolotl\n4 - DeepSpeed\n5 - YAML\n6 - JSON\n\nEnter number (1-6):');
+
+ if (!format) return;
+
+ const formatMap = {
+ '1': 'accelerate',
+ '2': 'lightning',
+ '3': 'axolotl',
+ '4': 'deepspeed',
+ '5': 'yaml',
+ '6': 'json',
+ };
+
+ const selectedFormat = formatMap[format];
+ if (!selectedFormat) {
+ this.showError('Invalid format selected');
+ return;
+ }
+
+ this.exportFrameworkConfig(selectedFormat);
+ }
+
+ async exportFrameworkConfig(format) {
+ try {
+ const config = this.collectFormData();
+ const response = await fetch(`${this.apiBase}/export/${format}`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(config),
+ });
+
+ if (!response.ok) {
+ throw new Error(`Failed to export ${format} config`);
+ }
+
+ const result = await response.json();
+ this.downloadConfig(result, format);
+ } catch (error) {
+ this.showError(`Error: ${error.message}`);
+ }
+ }
+}
+
+// Initialize the calculator when DOM is ready
+document.addEventListener('DOMContentLoaded', () => {
+ new GPUMemCalculator();
+});
diff --git a/web/templates/index.html b/web/templates/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..89da3e8b9904998ef0417d291a99793eb9c34deb
--- /dev/null
+++ b/web/templates/index.html
@@ -0,0 +1,999 @@
+
+
+
+
+
+ GPU Memory Calculator for LLM Training
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Training Configuration
+
+
+
+ Model Settings
+
+
+
+
+
+
+
+
+
+
+ Mixture of Experts (MoE)
+
+
+
+
+
+
+
With MoE, only 2 of 8 experts are active per token, reducing activation memory.
+
+
+
+
+
+
+
+
+ Parallelism
+
+ Effective GPUs: 8
+
+
+
+
+ Training Engine
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Results
+
+
+
Memory Breakdown
+
+ Per GPU:
+ -- GB
+
+
+ Total All GPUs:
+ -- GB
+
+
+ CPU Memory:
+ -- GB
+
+
+
+
+
Component Breakdown
+
+ Model Parameters:
+ -- GB
+
+
+ Gradients:
+ -- GB
+
+
+ Optimizer States:
+ -- GB
+
+
+ Activations:
+ -- GB
+
+
+ Overhead:
+ -- GB
+
+
+
+
+
+ Params
+ Grads
+ Opt
+ Act
+
+
+
+
+
Feasibility
+
+ Status:
+ --
+
+
+ Utilization:
+ --%
+
+
+ Recommended Batch:
+ --
+
+
+
+
+
Formula Explanation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Inference Configuration
+
+
+
+ Model Settings
+
+
+
+
+
+
+
+
+
+
+
+
+
+ TGI-Specific Settings
+
+
+
+
+
+ vLLM-Specific Settings
+
+
+
+
+
+ TensorRT-LLM-Specific Settings
+
+
+
+
+
+ SGLang-Specific Settings
+
+
+
+
+
+
+
+
+
+
+
+
+
Inference Results
+
+
+
Memory Breakdown
+
+ Per GPU:
+ -- GB
+
+
+ Total All GPUs:
+ -- GB
+
+
+ Model Parameters:
+ -- GB
+
+
+ KV Cache:
+ -- GB
+
+
+ Activations:
+ -- GB
+
+
+
+
+
Performance Estimates
+
+ Max Batch Size:
+ --
+
+
+ Estimated Throughput:
+ -- tokens/sec
+
+
+ Fits on GPU:
+ --
+
+
+ Utilization:
+ --%
+
+
+
+
+
+
+
+
+
Multi-Node Training Configuration
+
Calculate network communication overhead for distributed training across multiple nodes.
+
+
+
+ Model Settings
+
+
+
+
+
+
+
+
+
+
+ Node Configuration
+
+ Total GPUs: 16
+
+
+
+
+ Training Configuration
+
+
+
+
+
+ Parallelism Strategy
+
+
+
+
+
+
+
+
+
+
+
+
+
Multi-Node Results
+
+
+
Network Overhead
+
+ Total Overhead:
+ -- GB
+
+
+ AllReduce:
+ -- GB
+
+
+ AllGather:
+ -- GB
+
+
+ ReduceScatter:
+ -- GB
+
+
+ Pipeline Comm:
+ -- GB
+
+
+
+
+
Time Impact
+
+ Est. Overhead:
+ -- ms/step
+
+
+ Communication Time:
+ -- ms/step
+
+
+ Latency Impact:
+ -- ms
+
+
+
+
+
Optimization Suggestions
+
+
Run calculation to see optimization suggestions.
+
+
+
+
+
+
+
+
+
+
+
+
+