import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from typing import Dict, Tuple, List

# Model specifications (approximate parameter counts and memory requirements)
MODEL_SPECS = {
    "LLaMA-2-7B": {"params": 7e9, "base_memory_gb": 14},
    "LLaMA-2-13B": {"params": 13e9, "base_memory_gb": 26},
    "LLaMA-2-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3.1-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-405B": {"params": 405e9, "base_memory_gb": 810},
    "Nemotron-4-340B": {"params": 340e9, "base_memory_gb": 680},
    "Nemotron-4-15B": {"params": 15e9, "base_memory_gb": 30},
    "Qwen2-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2-72B": {"params": 72e9, "base_memory_gb": 144},
    "Qwen2.5-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2.5-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2.5-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2.5-14B": {"params": 14e9, "base_memory_gb": 28},
    "Qwen2.5-32B": {"params": 32e9, "base_memory_gb": 64},
    "Qwen2.5-72B": {"params": 72e9, "base_memory_gb": 144},
    # Qwen Vision Language Models
    "Qwen-VL": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Chat": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Plus": {"params": 12e9, "base_memory_gb": 25},
    "Qwen-VL-Max": {"params": 30e9, "base_memory_gb": 65},
    "Qwen2-VL-2B": {"params": 2e9, "base_memory_gb": 5},
    "Qwen2-VL-7B": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-VL-72B": {"params": 72e9, "base_memory_gb": 150},
    # NVIDIA VILA Series
    "VILA-1.5-3B": {"params": 3e9, "base_memory_gb": 7},
    "VILA-1.5-8B": {"params": 8e9, "base_memory_gb": 18},
    "VILA-1.5-13B": {"params": 13e9, "base_memory_gb": 28},
    "VILA-1.5-40B": {"params": 40e9, "base_memory_gb": 85},
    # Qwen Audio Models
    "Qwen-Audio": {"params": 8e9, "base_memory_gb": 18},
    "Qwen-Audio-Chat": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-Audio-7B": {"params": 8e9, "base_memory_gb": 18},
    # NVIDIA PhysicsNeMo Models
    "PhysicsNeMo-FNO-Small": {"params": 1e6, "base_memory_gb": 0.5},
    "PhysicsNeMo-FNO-Medium": {"params": 10e6, "base_memory_gb": 2},
    "PhysicsNeMo-FNO-Large": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-PINN-Small": {"params": 0.5e6, "base_memory_gb": 0.2},
    "PhysicsNeMo-PINN-Medium": {"params": 5e6, "base_memory_gb": 1},
    "PhysicsNeMo-PINN-Large": {"params": 20e6, "base_memory_gb": 4},
    "PhysicsNeMo-GraphCast-Small": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-GraphCast-Medium": {"params": 200e6, "base_memory_gb": 20},
    "PhysicsNeMo-GraphCast-Large": {"params": 1e9, "base_memory_gb": 50},
    "PhysicsNeMo-SFNO-Small": {"params": 25e6, "base_memory_gb": 5},
    "PhysicsNeMo-SFNO-Medium": {"params": 100e6, "base_memory_gb": 15},
    "PhysicsNeMo-SFNO-Large": {"params": 500e6, "base_memory_gb": 35},
}

# H100 specifications
H100_MEMORY_GB = 80  # Memory per GPU
H100_GPUS_PER_NODE = 8  # GPUs per node
H100_NODE_MEMORY_GB = H100_MEMORY_GB * H100_GPUS_PER_NODE  # 640 GB per node
H100_COMPUTE_CAPABILITY = "9.0"

# CUDA version recommendations based on model and use case
CUDA_RECOMMENDATIONS = {
    "inference": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    },
    "training": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    },
    "fine_tuning": {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4"
    }
}


def calculate_kv_cache_memory(num_tokens: int, model_params: float, num_layers: int = None) -> float:
    """Calculate KV cache memory requirements in GB."""
    if num_layers is None:
        # Estimate layer count from model size
        if model_params < 1e9:
            num_layers = 24
        elif model_params < 10e9:
            num_layers = 32
        elif model_params < 100e9:
            num_layers = 80
        else:
            num_layers = 96

    # KV cache memory per token (approximate):
    # 2 (K + V) * 2 bytes (FP16) * hidden_dim * num_layers.
    # hidden_dim is backed out from params ≈ 12 * num_layers * hidden_dim^2
    # (rough estimate for a standard dense transformer), rounded down to a
    # multiple of 64 and floored at 64 for very small models.
    hidden_dim = max(64, int((model_params / (num_layers * 12)) ** 0.5 // 64) * 64)
    kv_memory_per_token = 2 * 2 * hidden_dim * num_layers / (1024 ** 3)  # GB
    return num_tokens * kv_memory_per_token
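
# Illustrative check of the heuristic above (assumed, approximate figures): for an
# ~8B-parameter model it estimates 32 layers and a hidden size around 4,500, i.e.
# roughly 0.5 MB of FP16 KV cache per token, so calculate_kv_cache_memory(2560, 8e9)
# comes out near 1.4 GB for a single sequence before batching.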


def estimate_h100_nodes(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
) -> Tuple[int, str, Dict]:
    """
    Estimate the number of H100 nodes required.

    Returns:
        - Number of nodes required
        - Detailed explanation
        - Dictionary with breakdown
    """
    if model_name not in MODEL_SPECS:
        return 1, f"Model {model_name} not found in specifications", {}

    model_spec = MODEL_SPECS[model_name]
    base_memory = model_spec["base_memory_gb"]

    # Adjust memory based on precision.
    # The base_memory_gb figures above roughly correspond to FP16 weights
    # (~2 bytes per parameter), so the multipliers are relative to FP16.
    precision_multiplier = {
        "FP32": 2.0,
        "FP16": 1.0,
        "BF16": 1.0,
        "INT8": 0.5,
        "INT4": 0.25
    }
    model_memory = base_memory * precision_multiplier.get(precision, 1.0)

    # Calculate KV cache memory
    total_tokens = input_tokens + output_tokens
    kv_cache_memory = calculate_kv_cache_memory(total_tokens, model_spec["params"]) * batch_size

    # Use-case-specific memory overhead
    overhead_multiplier = {
        "inference": 1.2,    # 20% overhead
        "training": 3.0,     # 3x for gradients, optimizer states
        "fine_tuning": 2.5   # 2.5x for fine-tuning
    }
    total_memory_per_instance = (model_memory + kv_cache_memory) * overhead_multiplier.get(use_case, 1.2)

    # Calculate nodes needed
    memory_per_node = H100_NODE_MEMORY_GB * 0.9  # Reserve 10% for system (576 GB usable per node)
    nodes_needed = max(1, int(np.ceil(total_memory_per_instance / memory_per_node)))

    # For very large models, consider model parallelism
    if model_memory > memory_per_node:
        min_nodes_for_model = int(np.ceil(model_memory / memory_per_node))
        nodes_needed = max(nodes_needed, min_nodes_for_model)

    # Generate explanation
    explanation = f"""
**Estimation Breakdown:**

• **Model**: {model_name} ({model_spec['params']/1e9:.1f}B parameters)
• **Precision**: {precision}
• **Model Memory**: {model_memory:.1f} GB
• **KV Cache Memory**: {kv_cache_memory:.1f} GB (for {total_tokens:,} tokens × {batch_size} batch size)
• **Use Case Overhead**: {overhead_multiplier.get(use_case, 1.2):.1f}x ({use_case})
• **Total Memory Required**: {total_memory_per_instance:.1f} GB
• **H100 Node Specs**: {H100_GPUS_PER_NODE} × {H100_MEMORY_GB}GB = {H100_NODE_MEMORY_GB}GB per node
• **Usable Memory**: {memory_per_node:.1f} GB per node (10% reserved)

**Recommendation**: {nodes_needed} H100 node(s) ({nodes_needed * H100_GPUS_PER_NODE} H100 GPUs total)
"""

    breakdown = {
        "model_memory_gb": model_memory,
        "kv_cache_memory_gb": kv_cache_memory,
        "total_memory_gb": total_memory_per_instance,
        "h100_memory_per_node_gb": memory_per_node,
        "nodes_required": nodes_needed
    }

    return nodes_needed, explanation, breakdown
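
# Example programmatic use outside the Gradio UI (hypothetical values):
#   nodes, text, breakdown = estimate_h100_nodes(
#       "LLaMA-3-70B", input_tokens=4096, output_tokens=1024,
#       batch_size=4, use_case="inference", precision="FP16",
#   )
#   print(nodes, breakdown["total_memory_gb"])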


def get_cuda_recommendation(use_case: str) -> str:
    """Get CUDA version recommendation based on use case."""
    cuda_info = CUDA_RECOMMENDATIONS.get(use_case, CUDA_RECOMMENDATIONS["inference"])

    recommendation = f"""
**CUDA Version Recommendations for {use_case.title()}:**

• **Optimal**: CUDA {cuda_info['optimal']} + cuDNN 8.9+
• **Recommended**: CUDA {cuda_info['recommended']} + cuDNN 8.7+
• **Minimum**: CUDA {cuda_info['minimum']} + cuDNN 8.5+

**Additional Requirements:**

• **Driver Version**: 525.60.13+ (Linux) / 527.41+ (Windows)
• **Compute Capability**: {H100_COMPUTE_CAPABILITY} (H100 native)
• **Node Configuration**: {H100_GPUS_PER_NODE} × H100 GPUs per node ({H100_NODE_MEMORY_GB}GB total)
• **Memory**: ECC enabled recommended for production
"""
    return recommendation


def create_performance_chart(breakdown: Dict) -> plt.Figure:
    """Create a memory utilization chart."""
    if not breakdown:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, 'No data to display', ha='center', va='center')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        return fig

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Memory breakdown pie chart
    labels = ['Model Memory', 'KV Cache', 'Overhead']
    model_mem = breakdown['model_memory_gb']
    kv_mem = breakdown['kv_cache_memory_gb']
    overhead_mem = breakdown['total_memory_gb'] - model_mem - kv_mem
    sizes = [model_mem, kv_mem, overhead_mem]
    colors = ['#ff9999', '#66b3ff', '#99ff99']
    ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Memory Breakdown')

    # Node utilization bar chart
    nodes = breakdown['nodes_required']
    total_memory = breakdown['total_memory_gb']
    memory_per_node = breakdown['h100_memory_per_node_gb']

    node_labels = [f'Node {i+1}' for i in range(nodes)]
    utilization = []
    for i in range(nodes):
        if i < nodes - 1:
            # All but the last node are assumed fully packed
            utilization.append(memory_per_node)
        else:
            remaining_memory = total_memory - (nodes - 1) * memory_per_node
            utilization.append(remaining_memory)

    utilization_pct = [u / memory_per_node * 100 for u in utilization]
    bars = ax2.bar(node_labels, utilization_pct, color='skyblue', alpha=0.7)
    ax2.axhline(y=100, color='red', linestyle='--', alpha=0.7, label='Max Capacity')
    ax2.set_ylabel('Memory Utilization (%)')
    ax2.set_title('H100 Node Memory Utilization')
    ax2.set_ylim(0, 110)
    ax2.legend()

    # Add value labels on bars
    for bar, pct in zip(bars, utilization_pct):
        ax2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                 f'{pct:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    return fig


def estimate_nodes_interface(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
):
    """Main interface function."""
    # Validate inputs
    if input_tokens <= 0 or output_tokens <= 0:
        return "Please enter valid token counts (> 0)", "", None, "## ⚠️ <span style='color: #E74C3C;'>**Invalid Input: Token counts must be > 0**</span>"
    if batch_size <= 0:
        return "Please enter a valid batch size (> 0)", "", None, "## ⚠️ <span style='color: #E74C3C;'>**Invalid Input: Batch size must be > 0**</span>"

    # Calculate node requirements
    nodes_needed, explanation, breakdown = estimate_h100_nodes(
        model_name, input_tokens, output_tokens, batch_size, use_case, precision
    )

    # Get CUDA recommendations
    cuda_rec = get_cuda_recommendation(use_case)

    # Create performance chart
    fig = create_performance_chart(breakdown)

    return explanation, cuda_rec, fig, f"## 🖥️ <span style='color: #4A90E2;'>**Estimated H100 Nodes Required: {nodes_needed}**</span>"


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="H100 Node Estimator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🚀 H100 Node & CUDA Version Estimator")
        gr.Markdown("Get recommendations for H100 node count and CUDA version based on your model and workload requirements.")
        gr.Markdown("**Comprehensive Model Support**: LLaMA, Nemotron, Qwen2/2.5, Qwen-VL, VILA, Qwen-Audio, and **NVIDIA PhysicsNeMo** series!")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## Input Parameters")

                model_dropdown = gr.Dropdown(
                    choices=list(MODEL_SPECS.keys()),
                    value="LLaMA-3-8B",
                    label="Model",
                    info="Select the model you want to run (includes LLMs, multimodal, and physics-ML models)"
                )
                input_tokens = gr.Number(
                    value=2048,
                    label="Input Tokens",
                    info="Number of input tokens per request"
                )
                output_tokens = gr.Number(
                    value=512,
                    label="Output Tokens",
                    info="Number of output tokens per request"
                )
                batch_size = gr.Number(
                    value=1,
                    label="Batch Size",
                    info="Number of concurrent requests"
                )
                use_case = gr.Dropdown(
                    choices=["inference", "training", "fine_tuning"],
                    value="inference",
                    label="Use Case",
                    info="What will you use the model for?"
                )
                precision = gr.Dropdown(
                    choices=["FP32", "FP16", "BF16", "INT8", "INT4"],
                    value="FP16",
                    label="Precision",
                    info="Model precision/quantization"
                )

                estimate_btn = gr.Button("💡 Estimate Requirements", variant="primary")

            with gr.Column(scale=2):
                gr.Markdown("## Results")
                node_count = gr.Markdown("## 🖥️ <span style='color: #4A90E2;'>**Ready to estimate...**</span>")

                with gr.Tab("📊 Detailed Analysis"):
                    detailed_output = gr.Markdown()
                with gr.Tab("🔧 CUDA Recommendations"):
                    cuda_output = gr.Markdown()
                with gr.Tab("📈 Memory Utilization"):
                    chart_output = gr.Plot()

        # Connect the interface
        estimate_btn.click(
            fn=estimate_nodes_interface,
            inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision],
            outputs=[detailed_output, cuda_output, chart_output, node_count]
        )

        # Add examples
        gr.Markdown("## 💡 Example Scenarios")
        examples = [
            ["LLaMA-3-8B", 2048, 512, 1, "inference", "FP16"],
            ["LLaMA-3-70B", 4096, 1024, 4, "inference", "FP16"],
            ["Qwen2.5-72B", 8192, 2048, 2, "fine_tuning", "BF16"],
            ["Nemotron-4-340B", 2048, 1024, 1, "inference", "INT8"],
            ["Qwen2-VL-7B", 1024, 256, 1, "inference", "FP16"],
            ["VILA-1.5-13B", 2048, 512, 2, "inference", "BF16"],
            ["Qwen2-Audio-7B", 1024, 256, 1, "inference", "FP16"],
            ["PhysicsNeMo-FNO-Large", 512, 128, 8, "training", "FP32"],
            ["PhysicsNeMo-GraphCast-Medium", 1024, 256, 4, "training", "FP16"],
        ]

        gr.Examples(
            examples=examples,
            inputs=[model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision],
            outputs=[detailed_output, cuda_output, chart_output, node_count],
            fn=estimate_nodes_interface,
            cache_examples=False
        )

        gr.Markdown("""
## ℹ️ Notes

- **Multimodal Models**: Vision-language and audio models may require additional memory for image/audio processing
- **PhysicsNeMo Models**: Physics-ML models (FNO, PINN, GraphCast, SFNO) typically require higher batch sizes for training
- **Token Estimation**: For multimodal models, consider image patches (~256-1024 tokens per image) and audio frames
- **Physics Simulations**: PhysicsNeMo models often work with spatial/temporal grids rather than tokens
- Estimates are approximate and may vary based on actual implementation details
- Memory calculations include model weights, KV cache, and operational overhead
- Consider network bandwidth and storage requirements for multi-node setups
- For production deployments, add 10-20% buffer for optimal performance
""")

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
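
# To run this app outside the Space (assumed local environment), install the imported
# dependencies, e.g.: pip install gradio matplotlib numpy pandas
# On Hugging Face Spaces the platform manages the server, so share=True and the
# server_name/server_port arguments above mainly matter for local launches.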