# NOTE: The following lines are a scraping artifact (GitHub commit header),
# not part of the module; kept only as a comment so the file stays parseable.
# carraraig's picture / Hello / 8816dfd
"""
Capacity Estimation Node
This node handles the estimation of compute capacity requirements for model deployment.
Currently minimal implementation - placeholder for future capacity estimation logic.
Key Features:
- Compute capacity estimation (placeholder)
- Resource requirement analysis (placeholder)
- State management for workflow
Author: ComputeAgent Team
License: Private
"""
import logging
import math
from typing import Dict, Any
logger = logging.getLogger("CapacityEstimation")
# Bytes of storage per parameter, keyed by (normalized) dtype string.
DTYPE_FACTOR = {
    # 16-bit PyTorch dtypes ("auto" defaults to half precision)
    "auto": 2,
    "half": 2,
    "float16": 2,
    "fp16": 2,
    "bfloat16": 2,
    "bf16": 2,
    # 32-bit floats
    "float": 4,
    "float32": 4,
    "fp32": 4,
    # 8-bit quantized formats (incl. HuggingFace "f8_*" spellings)
    "fp8": 1,
    "fp8_e4m3": 1,
    "fp8_e5m2": 1,
    "f8_e4m3": 1,
    "f8_e5m2": 1,
    "int8": 1,
    # 4-bit quantization packs two parameters per byte
    "int4": 0.5,
}
# Bytes per KV-cache element, keyed by (normalized) dtype string.
# "auto" is a sentinel: callers substitute the model's own dtype factor.
KV_CACHE_DTYPE_FACTOR = {
    "auto": None,
    # 32-bit floats
    "float32": 4,
    "fp32": 4,
    # 16-bit floats
    "float16": 2,
    "fp16": 2,
    "bfloat16": 2,
    "bf16": 2,
    # 8-bit formats (incl. HuggingFace "f8_*" spellings)
    "fp8": 1,
    "fp8_e5m2": 1,
    "fp8_e4m3": 1,
    "f8_e4m3": 1,
    "f8_e5m2": 1,
    "int8": 1,
}
# VRAM capacity per supported GPU model, in GB.
GPU_SPECS = {
    "RTX 4090": 24,
    "RTX 5090": 32,
}

# Rental rate per supported GPU model, in EUR per hour.
GPU_PRICING = {
    "RTX 4090": 0.2,
    "RTX 5090": 0.4,
}
def normalize_dtype(dtype: str) -> str:
    """
    Canonicalize a dtype string for consistent table lookups.

    Falsy input (empty string or None) maps to "auto"; anything else is
    simply lowercased, which already covers HuggingFace safetensors
    spellings such as "F8_E4M3" -> "f8_e4m3" and "BF16" -> "bf16".

    Args:
        dtype: Raw dtype string (e.g., "F8_E4M3", "BF16", "float16")

    Returns:
        Normalized lowercase dtype string.
    """
    return dtype.lower() if dtype else "auto"
def get_dtype_factor(dtype: str, default: int = 2) -> float:
    """
    Look up the bytes-per-parameter factor for a dtype.

    Args:
        dtype: Data type string (any case; normalized internally).
        default: Factor returned when the dtype is unknown.

    Returns:
        Bytes per parameter for the dtype.
    """
    key = normalize_dtype(dtype)
    return DTYPE_FACTOR.get(key, default)
def estimate_vllm_gpu_memory(
    num_params: int,
    dtype: str = "auto",
    num_hidden_layers: int = None,
    hidden_size: int = None,
    intermediate_size: int = None,
    num_key_value_heads: int = None,
    head_dim: int = None,
    max_model_len: int = 2048,
    max_num_seqs: int = 256,
    max_num_batched_tokens: int = 2048,
    kv_cache_dtype: str = "auto",
    gpu_memory_utilization: float = 0.9,
    cpu_offload_gb: float = 0.0,
    is_quantized: bool = None  # indicate if num_params is already quantized
) -> float:
    """
    Estimate GPU memory (in GB) needed to serve a model with vLLM.

    Two estimation paths:
    1. Full architecture info available -> detailed formula including KV cache
       and activation peak memory.
    2. Only num_params/dtype available -> rough weights + activation estimate.

    Args:
        num_params: Number of parameters. For quantized models from the HF API
            this already reflects the quantized storage format.
        dtype: Model weight dtype string (any case; normalized internally).
        num_hidden_layers, hidden_size, intermediate_size,
        num_key_value_heads, head_dim: Architecture details; if any is None
            the rough path is used.
        max_model_len, max_num_seqs, max_num_batched_tokens: vLLM scheduling
            limits that bound the KV cache size.
        kv_cache_dtype: KV cache dtype; "auto" follows the model dtype
            (FP16 for quantized models).
        gpu_memory_utilization: Fraction of GPU memory vLLM may use.
        cpu_offload_gb: GB of weights offloaded to CPU, subtracted at the end.
        is_quantized: If True, num_params represents quantized size.
            If None, auto-detected from dtype.

    Returns:
        Estimated required GPU memory in GB.
    """
    constant_margin = 1.5  # GB safety margin (CUDA context, fragmentation)
    dtype_factor = get_dtype_factor(dtype, default=2)

    # Auto-detect quantization from the weight dtype when not specified.
    if is_quantized is None:
        quantized_dtypes = {"fp8", "fp8_e4m3", "fp8_e5m2",
                            "f8_e4m3", "f8_e5m2", "int8", "int4"}
        is_quantized = normalize_dtype(dtype) in quantized_dtypes

    # Model weights in GB. The HF API reports quantized models' parameter
    # counts in the quantized dtype already, so no dtype_factor multiply then.
    if is_quantized:
        model_weight = num_params / 1e9
    else:
        model_weight = (num_params * dtype_factor) / 1e9

    # Case 1: architecture details incomplete -> rough estimate.
    arch_fields = (num_hidden_layers, hidden_size, intermediate_size,
                   num_key_value_heads, head_dim)
    if any(field is None for field in arch_fields):
        # Activation memory is roughly 1-2x model weight for transformers
        # (typically FP16 regardless of weight dtype).
        activation_estimate = model_weight * 1.5
        return (model_weight + activation_estimate + constant_margin) / gpu_memory_utilization - cpu_offload_gb

    # Case 2: full info available -> detailed vLLM formula.
    # BUG FIX: normalize BEFORE the "auto" check. Previously "AUTO"/"Auto"
    # slipped past the exact-string comparison into the table lookup, which
    # returned the None sentinel stored under "auto" and crashed the
    # KV-cache arithmetic below with a TypeError.
    normalized_kv = normalize_dtype(kv_cache_dtype)
    if normalized_kv == "auto":
        # Quantized models typically keep an FP16/BF16 KV cache, not FP8.
        kv_cache_dtype_factor = 2 if is_quantized else dtype_factor
    else:
        kv_cache_dtype_factor = KV_CACHE_DTYPE_FACTOR.get(normalized_kv, 2)

    # KV cache per sequence: 2 (K and V) * kv_heads * head_dim * layers *
    # bytes-per-element * tokens.
    per_seq_kv_cache_memory = (2 * num_key_value_heads * head_dim * num_hidden_layers *
                               kv_cache_dtype_factor * max_model_len) / 1e9
    # Total KV cache is capped by the batched-token budget.
    total_kv_cache_memory = min(
        per_seq_kv_cache_memory * max_num_seqs,
        (2 * num_hidden_layers * hidden_size * kv_cache_dtype_factor * max_num_batched_tokens) / 1e9
    )

    # Activations are typically FP16/BF16 even for quantized models.
    activation_dtype_factor = 2
    activation_peak_memory = max_model_len * ((18 * hidden_size) + (4 * intermediate_size)) * activation_dtype_factor / 1e9

    return (model_weight + total_kv_cache_memory + activation_peak_memory + constant_margin) / gpu_memory_utilization - cpu_offload_gb
def calculate_gpu_requirements(estimated_memory_gb: float) -> Dict[str, Any]:
    """
    Compute the GPU count and rental costs per supported GPU type.

    Args:
        estimated_memory_gb: Estimated GPU memory requirement in GB.

    Returns:
        Dict with "gpu_requirements" (GPU type -> count) and
        "cost_estimates" (GPU type -> hourly/daily/weekly EUR costs).
    """
    gpu_requirements: Dict[str, int] = {}
    cost_estimates: Dict[str, Dict[str, float]] = {}

    for gpu_type, total_vram in GPU_SPECS.items():
        # Reserve ~10% of VRAM for multi-GPU communication and fragmentation.
        effective_vram = total_vram * 0.9
        gpu_count = math.ceil(estimated_memory_gb / effective_vram)

        hourly = gpu_count * GPU_PRICING[gpu_type]
        gpu_requirements[gpu_type] = gpu_count
        cost_estimates[gpu_type] = {
            "hourly": hourly,
            "daily": hourly * 24,
            "weekly": hourly * 24 * 7,
        }

    return {
        "gpu_requirements": gpu_requirements,
        "cost_estimates": cost_estimates,
    }
async def capacity_estimation_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Estimate GPU memory for a model deployment using vLLM-based computation.
    Handles both initial estimation and re-estimation with custom inference config.

    Args:
        state: Workflow state dict. Reads "model_name", "model_info",
            "needs_re_estimation" and, on re-estimation,
            "custom_inference_config" (GPU_type / location).

    Returns:
        The same state dict, updated with estimation results, status flags,
        and a Markdown "response" message.
    """
    # Check if this is a re-estimation
    is_re_estimation = state.get("needs_re_estimation", False)
    if is_re_estimation:
        logger.info("πŸ”„ Starting capacity re-estimation with custom inference configuration")
        # Reset the re-estimation flag
        state["needs_re_estimation"] = False
        state["capacity_approved"] = False
    else:
        logger.info("⚑ Starting capacity estimation node")

    try:
        model_name = state.get("model_name")
        model_info = state.get("model_info")
        if not model_name or not model_info:
            logger.error("❌ Missing model information")
            state["capacity_estimation_status"] = "error"
            state["error"] = "Model information required for capacity estimation"
            return state

        # Extract safetensors info
        dtype = model_info.get("dtype", "auto")
        num_params = model_info.get("num_params", None)

        # Extract required parameters for GPU memory estimation
        params = {
            "num_params": num_params,
            "dtype": dtype,
            "num_hidden_layers": model_info.get("num_hidden_layers"),
            "hidden_size": model_info.get("hidden_size"),
            "intermediate_size": model_info.get("intermediate_size"),
            "num_key_value_heads": model_info.get("num_key_value_heads"),
            "head_dim": model_info.get("head_dim"),
            "max_model_len": model_info.get("max_model_len", 2048),
            "max_num_seqs": model_info.get("max_num_seqs", 256),
            "max_num_batched_tokens": model_info.get("max_num_batched_tokens", 2048),
            "kv_cache_dtype": model_info.get("kv_cache_dtype", "auto"),
            "gpu_memory_utilization": model_info.get("gpu_memory_utilization", 0.9),
            "cpu_offload_gb": model_info.get("cpu_offload_gb", 0.0)
        }

        estimated_gpu_memory = estimate_vllm_gpu_memory(**params)

        # Calculate GPU requirements and costs
        gpu_data = calculate_gpu_requirements(estimated_gpu_memory)
        gpu_requirements = gpu_data["gpu_requirements"]
        cost_estimates = gpu_data["cost_estimates"]

        # Store in state
        state["estimated_gpu_memory"] = estimated_gpu_memory
        state["gpu_requirements"] = gpu_requirements
        state["cost_estimates"] = cost_estimates
        state["capacity_estimation_status"] = "success"

        # BUG FIX: the old code set model_size_b to the string "Unknown" when
        # num_params was missing, then formatted it with ":.2f" in the
        # response f-string, raising ValueError. Pre-render the display value.
        model_size_display = f"{num_params / 1e9:.2f}B" if num_params else "Unknown"

        # Model architecture details (only the fields that are present)
        architecture_info = []
        if model_info.get("num_hidden_layers"):
            architecture_info.append(f"**Layers:** {model_info['num_hidden_layers']}")
        if model_info.get("hidden_size"):
            architecture_info.append(f"**Hidden Size:** {model_info['hidden_size']}")
        if model_info.get("num_attention_heads"):
            architecture_info.append(f"**Attention Heads:** {model_info['num_attention_heads']}")
        if model_info.get("num_key_value_heads"):
            architecture_info.append(f"**KV Heads:** {model_info['num_key_value_heads']}")
        if model_info.get("intermediate_size"):
            architecture_info.append(f"**Intermediate Size:** {model_info['intermediate_size']}")
        if model_info.get("max_position_embeddings"):
            architecture_info.append(f"**Max Position Embeddings:** {model_info['max_position_embeddings']}")
        architecture_section = "\n ".join(architecture_info) if architecture_info else "Limited architecture information available"

        # Inference configuration
        inference_config = f"""**Max Model Length:** {params['max_model_len']}
**Max Sequences:** {params['max_num_seqs']}
**Max Batched Tokens:** {params['max_num_batched_tokens']}
**KV Cache dtype:** {params['kv_cache_dtype']}
**GPU Memory Utilization:** {params['gpu_memory_utilization']*100:.0f}%"""

        # GPU requirements and cost section
        gpu_req_lines = []
        cost_lines = []
        # Highlight RTX 4090 and 5090
        for gpu_type in ["RTX 4090", "RTX 5090"]:
            if gpu_type in gpu_requirements:
                num_gpus = gpu_requirements[gpu_type]
                gpu_memory = GPU_SPECS[gpu_type]
                costs = cost_estimates[gpu_type]
                gpu_req_lines.append(f"**{gpu_type}** ({gpu_memory}GB): **{num_gpus} GPU{'s' if num_gpus > 1 else ''}**")
                cost_lines.append(f"**{gpu_type}:** €{costs['hourly']:.2f}/hour | €{costs['daily']:.2f}/day | €{costs['weekly']:.2f}/week")
        gpu_requirements_section = "\n ".join(gpu_req_lines)
        cost_section = "\n ".join(cost_lines)

        # Build final response
        estimation_title = "**Capacity Re-Estimation Complete**" if is_re_estimation else "**Capacity Estimation Complete**"
        custom_note = "*Note: Re-estimated with custom inference configuration. " if is_re_estimation else "*Note: "
        # On re-estimation the user-selected values win; otherwise fall back
        # to model_info defaults.
        GPU_type = state['custom_inference_config']['GPU_type'] if is_re_estimation else model_info.get('GPU_type', 'RTX 4090')
        location = state['custom_inference_config']['location'] if is_re_estimation else model_info.get('location', 'UAE-1')

        state["response"] = f"""
{estimation_title}
**Model Information:**
**Name:** {model_name}
**Parameters:** {model_size_display}
**Data Type:** {dtype}
**Architecture Details:**
{architecture_section}
**Inference Configuration:**
{inference_config}
**Estimated GPU Memory Required:** {estimated_gpu_memory:.2f} GB
**GPU Requirements:**
{gpu_requirements_section}
**Cost Estimates:**
{cost_section}
**Selected GPU Type:** {GPU_type}
**Deployment Location:** {location}
{custom_note}This estimation includes model weights, KV cache, activation peak, and a safety margin. Multi-GPU setups account for ~10% overhead for communication.*"""

        logger.info(f"βœ… Estimated GPU memory: {estimated_gpu_memory:.2f} GB")
        logger.info(f"πŸ“Š GPU Requirements: RTX 4090: {gpu_requirements.get('RTX 4090', 'N/A')}, RTX 5090: {gpu_requirements.get('RTX 5090', 'N/A')}")

        # Prepare state for human approval - set pending capacity approval
        state["pending_capacity_approval"] = True
        state["needs_re_estimation"] = False  # Reset flag after processing
        state["current_step"] = "capacity_estimation_complete"

    except Exception as e:
        logger.error(f"❌ Error in capacity estimation: {str(e)}")
        state["capacity_estimation_status"] = "error"
        state["error"] = str(e)
        state["response"] = f"""❌ **Capacity Estimation Failed**
**Model:** {state.get('model_name', 'Unknown')}
**Error:** {str(e)}
Please check if:
1. The model exists on HuggingFace
2. You have access to the model (if it's gated)
3. Your HuggingFace token is valid"""

    return state