"""Katonic Multitenant Infrastructure Calculator.

Streamlit application that estimates node counts, GPU requirements, and
platform + GPU costs for a multi-tenant LLMOps deployment across AWS,
Azure, GCP, and on-premise datacenters.
"""

import json
import math
from datetime import datetime

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots

# Configure Streamlit for better performance.
# NOTE: st.set_page_config must be the *first* Streamlit command and may only
# be called once per session, so it lives at module level. (A duplicate call
# inside create_comprehensive_dashboard was removed — it raised
# StreamlitAPIException at runtime.)
st.set_page_config(
    page_title="Katonic Multitenant Infrastructure Calculator",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Cloud provider and On-Premise pricing data (per hour in USD)
CLOUD_PRICING = {
    'On-Premise': {
        'name': 'On-Premise Datacenter',
        'cost_per_node_hour': 0.192,  # ~50% of cloud (amortized hardware + power + cooling over 3 years)
        'managed_k8s_cost': 0.05,  # Self-managed K8s operational cost (admin time, monitoring tools)
        'description': 'Dell PowerEdge R640 / HPE DL360 equivalent',
        'specs': '8 vCPUs, 32GB RAM',
        'vectordb_node': {
            'instance_type': 'Dell PowerEdge R740 / HPE DL380 equivalent',
            'cost_per_hour': 0.384,  # ~50% of cloud (high-memory server amortized)
            'specs': '16 vCPUs, 64GB RAM'
        },
        'jump_host': {
            'instance_type': 'Dell PowerEdge R440 / HPE DL20 equivalent',
            'cost_per_hour': 0.048,  # ~50% of cloud (small server amortized)
            'specs': '2 vCPUs, 8GB RAM'
        },
        'additional_services': {
            'Network_Infrastructure': {'cost_per_hour': 0.020, 'description': 'Switches, routers, firewalls (amortized)'},
            'Storage_SAN': {'cost_per_gb_month': 0.05, 'description': 'SAN/NAS storage (1TB base, amortized)'},
            'Hardware_Load_Balancer': {'cost_per_hour': 0.010, 'description': 'F5/Citrix ADC (amortized)'},
            'Power_Cooling': {'cost_per_hour': 0.030, 'description': 'Datacenter power (0.1kW/server) and cooling'},
            'Datacenter_Space': {'cost_per_hour': 0.015, 'description': 'Rack space and facilities costs'},
            'Maintenance_Support': {'cost_per_hour': 0.025, 'description': 'Hardware maintenance and vendor support contracts'}
        },
        'gpu_pricing_multiplier': 0.55,  # On-prem GPU costs are ~55% of cloud (hardware amortization + power)
        'notes': 'Costs include: hardware amortization (3-year lifecycle), power (~$0.10/kWh), cooling (1:1 ratio), rack space, network infrastructure, storage, and maintenance. Assumes enterprise datacenter with N+1 redundancy. Does NOT include: initial capex, datacenter construction, staff salaries (covered in K8s management cost).'
    },
    'AWS': {
        'name': 'Amazon EKS',
        'cost_per_node_hour': 0.384,
        'managed_k8s_cost': 0.10,
        'description': 'm5.2xlarge instances',
        'specs': '8 vCPUs, 32GB RAM',
        'vectordb_node': {
            'instance_type': 'm5.4xlarge',
            'cost_per_hour': 0.768,
            'specs': '16 vCPUs, 64GB RAM'
        },
        'jump_host': {
            'instance_type': 'm5.large',
            'cost_per_hour': 0.096,
            'specs': '2 vCPUs, 8GB RAM'
        },
        'additional_services': {
            'VPC': {'cost_per_hour': 0.0, 'description': 'Virtual Private Cloud (Free)'},
            'EBS': {'cost_per_gb_month': 0.10, 'description': 'Elastic Block Store (1TB expandable)'},
            'ELB': {'cost_per_hour': 0.025, 'description': 'Elastic Load Balancer'},
            'EIP': {'cost_per_hour': 0.005, 'description': 'Elastic IP Address'}
        }
    },
    'Azure': {
        'name': 'Azure Kubernetes Service',
        'cost_per_node_hour': 0.384,
        'managed_k8s_cost': 0.0,
        'description': 'Standard_D8s_v3 instances',
        'specs': '8 vCPUs, 32GB RAM',
        'vectordb_node': {
            'instance_type': 'Standard_D16s_v3',
            'cost_per_hour': 0.768,
            'specs': '16 vCPUs, 64GB RAM'
        },
        'jump_host': {
            'instance_type': 'Standard_D2s_v3',
            'cost_per_hour': 0.096,
            'specs': '2 vCPUs, 8GB RAM'
        },
        'additional_services': {
            'VNet': {'cost_per_hour': 0.0, 'description': 'Virtual Network (Free)'},
            'Managed_Disks': {'cost_per_gb_month': 0.10, 'description': 'Managed Disks (1TB expandable)'},
            'Load_Balancer': {'cost_per_hour': 0.025, 'description': 'Azure Load Balancer'},
            'Public_IP': {'cost_per_hour': 0.005, 'description': 'Public IP Address'}
        }
    },
    'GCP': {
        'name': 'Google Kubernetes Engine',
        'cost_per_node_hour': 0.379,
        'managed_k8s_cost': 0.10,
        'description': 'n1-standard-8 instances',
        'specs': '8 vCPUs, 30GB RAM',
        'vectordb_node': {
            'instance_type': 'n1-standard-16',
            'cost_per_hour': 0.758,
            'specs': '16 vCPUs, 60GB RAM'
        },
        'jump_host': {
            'instance_type': 'e2-medium',
            'cost_per_hour': 0.067,
            'specs': '2 vCPUs, 8GB RAM'
        },
        'additional_services': {
            'VPC': {'cost_per_hour': 0.0, 'description': 'Virtual Private Cloud (Free)'},
            'Persistent_Disk': {'cost_per_gb_month': 0.10, 'description': 'Persistent Disk (1TB expandable)'},
            'Load_Balancer': {'cost_per_hour': 0.025, 'description': 'Cloud Load Balancing'},
            'Static_IP': {'cost_per_hour': 0.004, 'description': 'Static External IP'},
            'Cloud_Storage': {'cost_per_gb_month': 0.020, 'description': 'GCS Bucket (Optional)'},
            'Filestore': {'cost_per_gb_month': 0.20, 'description': 'Filestore (depends on usage)'}
        }
    }
}

# Production-grade model specifications.
# 'params' / 'active_params' are in billions; 'base_tps' is a reference
# aggregate tokens-per-second figure used for throughput scaling.
MODELS = {
    "Llama 4 Maverick": {
        "params": 400, "active_params": 17, "memory_per_param": 2,
        "max_context": 1000000, "base_tps": 4200, "org": "Meta",
        "license": "Open-weight",
        "notes": "Multimodal MoE; 1M context; text, image, code, reasoning"
    },
    "Llama 4 Scout": {
        "params": 109, "active_params": 17, "memory_per_param": 2,
        "max_context": 10000000, "base_tps": 4500, "org": "Meta",
        "license": "Open-weight",
        "notes": "Multimodal MoE; 10M context; efficient for long-form tasks"
    },
    "Llama 3.3 70B": {
        "params": 70, "active_params": 70, "memory_per_param": 2,
        "max_context": 128000, "base_tps": 1800, "org": "Meta",
        "license": "Community (open)",
        "notes": "Multilingual; matches Llama 3.1 405B performance"
    },
    "Qwen2 110B": {
        "params": 110, "active_params": 110, "memory_per_param": 2,
        "max_context": 128000, "base_tps": 1200, "org": "Alibaba/Qwen",
        "license": "Apache 2.0",
        "notes": "Multilingual; top-tier reasoning and coding"
    },
    "DeepSeek-VL 110B": {
        "params": 110, "active_params": 110, "memory_per_param": 2,
        "max_context": 128000, "base_tps": 1100, "org": "DeepSeek AI",
        "license": "MIT",
        "notes": "Multimodal (vision+language); GPT-4V alternative"
    },
    "Mixtral 8x22B": {
        "params": 141, "active_params": 39, "memory_per_param": 2,
        "max_context": 65536, "base_tps": 2800, "org": "Mistral AI",
        "license": "Apache 2.0",
        "notes": "Sparse MoE; efficiency leader among MoE models"
    }
}

# GPU catalog. 'memory' is in GB; per-provider pricing is USD/hour, with the
# string "NA" marking GPUs a provider does not offer.
GPUS = {
    "H200 141GB": {
        "memory": 141, "compute": 9.0, "tps_min": 5486, "tps_max": 18690,
        "efficiency_tier": "Flagship+",
        "pricing": {
            "aws": 15.70,
            "azure": 12.29,
            "gcp": "NA",
            "on-premise": 8.64  # 55% of AWS price (hardware amortization + power)
        }
    },
    "H100 80GB": {
        "memory": 80, "compute": 9.0, "tps_min": 2400, "tps_max": 14000,
        "efficiency_tier": "Flagship",
        "pricing": {
            "aws": 6.01,
            "azure": 6.98,
            "gcp": 11.06,
            "on-premise": 3.31  # 55% of AWS price
        }
    },
    "A100 80GB": {
        "memory": 80, "compute": 8.0, "tps_min": 1100, "tps_max": 2000,
        "efficiency_tier": "Excellent",
        "pricing": {
            "aws": 3.43,
            "azure": 3.67,
            "gcp": 2.48,
            "on-premise": 1.89  # 55% of AWS price
        }
    },
    "A100 40GB": {
        "memory": 40, "compute": 8.0, "tps_min": 1000, "tps_max": 1800,
        "efficiency_tier": "Good",
        "pricing": {
            "aws": 2.75,
            "azure": 3.67,
            "gcp": 1.46,
            "on-premise": 1.51  # 55% of AWS price
        }
    },
    "L40S": {
        "memory": 48, "compute": 8.9, "tps_min": 4000, "tps_max": 4768,
        "efficiency_tier": "Very Good",
        "pricing": {
            "aws": 1.67,
            "azure": "NA",
            "gcp": "NA",
            "on-premise": 0.92  # 55% of AWS price
        }
    }
}


@st.cache_data(show_spinner=False, ttl=300)
def calculate_detailed_infrastructure(num_tenants, apps_per_tenant):
    """Calculate detailed infrastructure requirements with node type breakdown - CACHED.

    Args:
        num_tenants: number of tenants sharing the platform.
        apps_per_tenant: applications per tenant; every 4 apps need 1 deploy node.

    Returns:
        Dict with 'node_breakdown' (per node type), 'totals', and 'specs'.
    """
    # Standard node specs (8 vCPUs, 32GB RAM)
    cores_per_node = 8
    ram_per_node = 32
    # VectorDB node specs (16 vCPUs, 64GB RAM) - Updated as per requirement
    vectordb_cores_per_node = 16
    vectordb_ram_per_node = 64

    # Base infrastructure (shared by all tenants)
    base_platform_nodes = 2
    # Per tenant requirements
    platform_nodes_per_tenant = 1
    compute_nodes_per_tenant = 1
    vectordb_nodes_per_tenant = 1  # Using 64GB RAM nodes for VectorDB

    # Every 4 apps need 1 deployment node
    deploy_nodes_per_tenant = math.ceil(apps_per_tenant / 4)

    # Calculate totals
    total_platform_nodes = base_platform_nodes + (platform_nodes_per_tenant * num_tenants)
    total_compute_nodes = compute_nodes_per_tenant * num_tenants
    total_deploy_nodes = deploy_nodes_per_tenant * num_tenants
    total_vectordb_nodes = vectordb_nodes_per_tenant * num_tenants

    # Total standard nodes (excluding VectorDB which uses different specs)
    total_standard_nodes = total_platform_nodes + total_compute_nodes + total_deploy_nodes
    total_nodes = total_standard_nodes + total_vectordb_nodes

    # Resource calculations (vCPUs and GB of RAM across the fleet)
    total_cpu = (total_standard_nodes * cores_per_node) + (total_vectordb_nodes * vectordb_cores_per_node)
    total_ram = (total_standard_nodes * ram_per_node) + (total_vectordb_nodes * vectordb_ram_per_node)

    # Applications capacity
    total_apps = num_tenants * apps_per_tenant

    return {
        'node_breakdown': {
            'Platform Nodes': {
                'base': base_platform_nodes,
                'tenant': platform_nodes_per_tenant * num_tenants,
                'total': total_platform_nodes,
                'cores': total_platform_nodes * cores_per_node,
                'ram': total_platform_nodes * ram_per_node,
                'purpose': 'Tenancy Manager + Tenant platform services',
                'node_type': 'Standard (8 vCPU, 32GB RAM)'
            },
            'Compute Nodes': {
                'base': 0,
                'tenant': total_compute_nodes,
                'total': total_compute_nodes,
                'cores': total_compute_nodes * cores_per_node,
                'ram': total_compute_nodes * ram_per_node,
                'purpose': 'Computational workloads',
                'node_type': 'Standard (8 vCPU, 32GB RAM)'
            },
            'Deploy Nodes': {
                'base': 0,
                'tenant': total_deploy_nodes,
                'total': total_deploy_nodes,
                'cores': total_deploy_nodes * cores_per_node,
                'ram': total_deploy_nodes * ram_per_node,
                'purpose': f'Application deployment ({deploy_nodes_per_tenant} node(s) per {apps_per_tenant} apps)',
                'node_type': 'Standard (8 vCPU, 32GB RAM)'
            },
            'VectorDB Nodes': {
                'base': 0,
                'tenant': total_vectordb_nodes,
                'total': total_vectordb_nodes,
                'cores': total_vectordb_nodes * vectordb_cores_per_node,
                'ram': total_vectordb_nodes * vectordb_ram_per_node,
                'purpose': 'Vector database operations (high memory)',
                'node_type': 'High-Memory (16 vCPU, 64GB RAM)'
            }
        },
        'totals': {
            'total_nodes': total_nodes,
            'total_standard_nodes': total_standard_nodes,
            'total_vectordb_nodes': total_vectordb_nodes,
            'total_cpu': total_cpu,
            'total_ram': total_ram,
            'total_apps': total_apps,
            'deploy_nodes_per_tenant': deploy_nodes_per_tenant
        },
        'specs': {
            'cores_per_node': cores_per_node,
            'ram_per_node': ram_per_node,
            'vectordb_cores_per_node': vectordb_cores_per_node,
            'vectordb_ram_per_node': vectordb_ram_per_node
        }
    }


def calculate_model_memory_requirements(model_params, active_params, precision_bytes):
    """Calculate memory requirements (GB) for model inference.

    model_params is expressed in billions, so params * bytes-per-param yields
    GB directly. Adds 25% runtime overhead and 10% KV-cache headroom on top
    of raw weights. NOTE: active_params is unused (total params are stored in
    memory even for MoE models); kept for interface compatibility.
    """
    model_memory = model_params * precision_bytes
    overhead = model_memory * 0.25
    kv_cache = model_memory * 0.1
    total_memory = model_memory + overhead + kv_cache
    return total_memory


def calculate_model_tps_on_gpu(model_base_tps, model_params, active_params, gpu_spec):
    """Calculate actual TPS for a specific model on a specific GPU.

    Scales the GPU's benchmark TPS range (calibrated on a 70B-active-param
    reference model) by (70 / active_params) ** 0.7, then picks a conservative
    point 30% into the scaled range. model_base_tps and model_params are
    unused; kept for interface compatibility.

    Returns:
        (estimated_tps, scaled_tps_min, scaled_tps_max)
    """
    effective_params = active_params
    reference_params = 70
    param_scaling = (reference_params / effective_params) ** 0.7

    actual_tps_min = gpu_spec["tps_min"] * param_scaling
    actual_tps_max = gpu_spec["tps_max"] * param_scaling
    # Conservative estimate: 30% of the way from min to max
    estimated_tps = actual_tps_min + (actual_tps_max - actual_tps_min) * 0.3
    return estimated_tps, actual_tps_min, actual_tps_max


def calculate_gpu_node_configurations(total_gpus_needed, gpu_memory_gb, gpu_spec):
    """Calculate GPU node configurations based on standard cloud GPU node sizes.

    Args:
        total_gpus_needed: GPUs required by memory/throughput analysis.
        gpu_memory_gb: total model memory that must fit within one node.
        gpu_spec: GPU catalog entry (uses 'memory').

    Returns:
        (configurations, min_gpus_per_node) where configurations is sorted by
        utilization (desc) then total allocated GPUs (asc).
    """
    # Standard GPU node configurations: 1, 2, 4, 8 GPUs per node
    standard_configs = [1, 2, 4, 8]
    # Minimum GPUs per node so the model's memory fits on a single node
    min_gpus_per_node = math.ceil(gpu_memory_gb / gpu_spec["memory"])

    def _build_config(gpus_per_node):
        # One candidate configuration for a given node size.
        num_nodes = math.ceil(total_gpus_needed / gpus_per_node)
        total_gpus_allocated = num_nodes * gpus_per_node
        return {
            'gpus_per_node': gpus_per_node,
            'num_nodes': num_nodes,
            'total_gpus_allocated': total_gpus_allocated,
            'total_gpus_needed': total_gpus_needed,
            'utilization': (total_gpus_needed / total_gpus_allocated) * 100,
            'gpu_waste': total_gpus_allocated - total_gpus_needed,
            'meets_memory_req': gpus_per_node >= min_gpus_per_node,
            'memory_utilization': (gpu_memory_gb / (gpus_per_node * gpu_spec["memory"])) * 100
        }

    configurations = [_build_config(g) for g in standard_configs if g >= min_gpus_per_node]

    # If no configurations work (shouldn't happen with proper validation),
    # fall back to offering every standard node size.
    if not configurations:
        configurations = [_build_config(g) for g in standard_configs]

    # Sort by utilization (descending) and then by total GPUs (ascending)
    configurations.sort(key=lambda c: (-c['utilization'], c['total_gpus_allocated']))
    return configurations, min_gpus_per_node


@st.cache_data(show_spinner=False, ttl=300)
def calculate_gpu_requirements(conversations_per_minute, tokens_per_conversation, model_spec, gpu_spec, precision_bytes):
    """Calculate GPU requirements for LLM inference with proper node configurations - CACHED.

    Sizes the GPU fleet by the max of memory-driven and throughput-driven
    needs, then selects the most efficient standard node configuration.
    """
    # Required aggregate throughput in tokens/second
    required_tps = (conversations_per_minute * tokens_per_conversation) / 60

    # Memory needed to host the model (GB)
    model_memory_gb = calculate_model_memory_requirements(
        model_spec["params"], model_spec["active_params"], precision_bytes
    )

    # Model performance on this GPU
    estimated_tps, tps_min, tps_max = calculate_model_tps_on_gpu(
        model_spec["base_tps"], model_spec["params"], model_spec["active_params"], gpu_spec
    )

    # Basic GPU counts from each constraint (always at least 1)
    gpus_needed_memory = math.ceil(model_memory_gb / gpu_spec["memory"])
    gpus_needed_throughput = math.ceil(required_tps / estimated_tps)
    total_gpus_needed = max(gpus_needed_memory, gpus_needed_throughput, 1)

    # Proper GPU node configurations (rounded up to standard node sizes)
    gpu_configs, min_gpus_per_node = calculate_gpu_node_configurations(
        total_gpus_needed, model_memory_gb, gpu_spec
    )

    # Use the best (most efficient) configuration
    best_config = gpu_configs[0] if gpu_configs else None
    actual_gpus_allocated = best_config['total_gpus_allocated'] if best_config else total_gpus_needed

    return {
        'gpus_needed_memory': gpus_needed_memory,
        'gpus_needed_throughput': gpus_needed_throughput,
        'total_gpus_needed': total_gpus_needed,
        'actual_gpus_allocated': actual_gpus_allocated,
        'gpu_configurations': gpu_configs,
        'best_config': best_config,
        'min_gpus_per_node': min_gpus_per_node,
        'model_memory_gb': model_memory_gb,
        'required_tps': required_tps,
        'estimated_tps': estimated_tps,
        'tps_range': (tps_min, tps_max),
        'total_system_tps': estimated_tps * actual_gpus_allocated,
        'max_conversations_per_minute': (estimated_tps * actual_gpus_allocated * 60) / tokens_per_conversation,
        'bottleneck': 'Memory' if gpus_needed_memory >= gpus_needed_throughput else 'Throughput'
    }


def is_gpu_available_for_provider(provider, gpu_spec):
    """Check if GPU is actually available for a provider (not N/A and has valid pricing)."""
    gpu_pricing = gpu_spec.get("pricing", {})
    provider_key = provider.lower()
    if provider_key not in gpu_pricing:
        return False
    price = gpu_pricing[provider_key]
    # "NA" strings and non-positive values both mean "not offered"
    return price != "NA" and isinstance(price, (int, float)) and price > 0


def get_available_providers_for_gpu(gpu_spec):
    """Get list of providers that actually have the selected GPU available."""
    return [
        provider for provider in CLOUD_PRICING
        if is_gpu_available_for_provider(provider, gpu_spec)
    ]


def create_downloadable_cost_report(all_costs, infrastructure, gpu_requirements, model_spec, gpu_spec,
                                    selected_model, selected_gpu, num_tenants, apps_per_tenant,
                                    conversations_per_minute, tokens_per_conversation, precision, time_period):
    """Create a comprehensive cost report (JSON-serializable dict) for download."""
    report_data = {
        'report_metadata': {
            'generated_at': datetime.now().isoformat(),
            'configuration': {
                'tenants': num_tenants,
                'apps_per_tenant': apps_per_tenant,
                'total_apps': num_tenants * apps_per_tenant,
                'model': selected_model,
                'gpu': selected_gpu,
                'precision': precision,
                'conversations_per_minute': conversations_per_minute,
                'tokens_per_conversation': tokens_per_conversation,
                'time_period': time_period
            }
        },
        'infrastructure_summary': {
            'platform_nodes': infrastructure['totals']['total_standard_nodes'],
            'vectordb_nodes': infrastructure['totals']['total_vectordb_nodes'],
            'total_nodes': infrastructure['totals']['total_nodes'],
            # NOTE: this reports the GPU *count*, not node count, under 'gpu_nodes'
            'gpu_nodes': gpu_requirements['total_gpus_needed'],
            'total_cpu_cores': infrastructure['totals']['total_cpu'],
            'total_ram_gb': infrastructure['totals']['total_ram'],
            'total_gpu_memory_gb': gpu_requirements['total_gpus_needed'] * gpu_spec['memory'],
            'max_conversations_per_minute': gpu_requirements['max_conversations_per_minute']
        },
        'cost_breakdown_by_provider': {}
    }

    # Add cost breakdown for each provider; GPU figures become 'N/A' where the
    # provider does not offer the selected GPU.
    for provider, costs in all_costs.items():
        provider_available = is_gpu_available_for_provider(provider, gpu_spec)
        report_data['cost_breakdown_by_provider'][provider] = {
            'gpu_available': provider_available,
            'platform_costs': {
                'kubernetes_nodes': costs['platform_costs']['total_node_cost'],
                'vectordb_nodes': costs['platform_costs']['vectordb_node_cost'],
                'jump_host': costs['platform_costs']['jump_host_cost'],
                'additional_services': costs['platform_costs']['additional_services_cost'],
                'k8s_management': costs['platform_costs']['k8s_management_cost'],
                'platform_total': costs['platform_costs']['platform_total']
            },
            'gpu_costs': {
                'gpu_count': costs['gpu_costs']['gpu_count'],
                'gpu_cost_per_hour': costs['gpu_costs']['gpu_cost_per_hour'],
                'total_gpu_cost': costs['gpu_costs']['total_gpu_cost'] if provider_available else 'N/A'
            },
            'totals': {
                'platform_cost': costs['totals']['platform_cost'],
                'gpu_cost': costs['totals']['gpu_cost'] if provider_available else 'N/A',
                'total_cost': costs['totals']['total_cost'] if provider_available else 'N/A',
                'cost_per_hour': costs['totals']['cost_per_hour'] if provider_available else 'N/A',
                'cost_per_day': costs['totals']['cost_per_day'] if provider_available else 'N/A'
            },
            'service_details': costs['platform_costs']['service_costs']
        }
    return report_data


def format_cost_for_display(cost, available=True):
    """Format cost for display, handling N/A and None cases.

    FIX: calculate_detailed_costs emits None totals when the GPU is not
    available for a provider; the old code crashed with TypeError when
    formatting None with f"${cost:.2f}".
    """
    if not available or cost is None or cost == 'N/A':
        return 'N/A'
    return f"${cost:.2f}"


def calculate_detailed_costs(provider, infrastructure, gpu_requirements, gpu_spec, days=30):
    """Calculate detailed costs for both platform and GPU infrastructure.

    Returns a dict with 'platform_costs', 'gpu_costs', and 'totals'. When the
    selected GPU is not offered by the provider, GPU-dependent totals are None.
    """
    pricing = CLOUD_PRICING[provider]
    hours = days * 24

    # Platform infrastructure costs
    node_costs = {}
    total_standard_node_cost = 0
    total_vectordb_node_cost = 0
    for node_type, details in infrastructure['node_breakdown'].items():
        if node_type == 'VectorDB Nodes':
            # Use special pricing for VectorDB nodes
            node_cost = details['total'] * pricing['vectordb_node']['cost_per_hour'] * hours
            total_vectordb_node_cost = node_cost
        else:
            # Use standard pricing for other nodes
            node_cost = details['total'] * pricing['cost_per_node_hour'] * hours
            total_standard_node_cost += node_cost
        node_costs[node_type] = {
            'count': details['total'],
            'cost': node_cost,
            'cores': details['cores'],
            'ram': details['ram'],
            'node_type': details.get('node_type', 'Standard')
        }
    total_node_cost = total_standard_node_cost + total_vectordb_node_cost

    # Jump Host cost
    jump_host_cost = pricing['jump_host']['cost_per_hour'] * hours

    # Additional services costs. Per-GB services are billed for a 1TB base
    # only when the service name looks like block/SAN storage; other per-GB
    # services (e.g. GCP Filestore, "depends on usage") are counted as 0.
    additional_services_cost = 0
    service_costs = {}
    for service, details in pricing['additional_services'].items():
        if 'cost_per_hour' in details:
            service_cost = details['cost_per_hour'] * hours
        elif 'cost_per_gb_month' in details:
            if 'storage' in service.lower() or 'disk' in service.lower() or 'ebs' in service.lower() or 'san' in service.lower():
                service_cost = details['cost_per_gb_month'] * 1024 * (days / 30)
            else:
                service_cost = 0
        else:
            service_cost = 0
        service_costs[service] = service_cost
        additional_services_cost += service_cost

    # Kubernetes management cost
    k8s_management_cost = pricing['managed_k8s_cost'] * hours

    # GPU costs - properly handle N/A cases
    gpu_pricing = gpu_spec.get("pricing", {})
    gpu_available = is_gpu_available_for_provider(provider, gpu_spec)
    gpu_cost_per_hour = 0
    gpu_cost = 0
    if gpu_available:
        gpu_cost_per_hour = gpu_pricing[provider.lower()]
        gpu_cost = gpu_requirements['actual_gpus_allocated'] * gpu_cost_per_hour * hours

    # Total costs (parenthesized for clarity: the ternary covers the sum)
    platform_cost = total_node_cost + jump_host_cost + additional_services_cost + k8s_management_cost
    total_cost = (platform_cost + gpu_cost) if gpu_available else None  # None for N/A cases

    return {
        'platform_costs': {
            'node_costs': node_costs,
            'total_node_cost': total_node_cost,
            'vectordb_node_cost': total_vectordb_node_cost,
            'jump_host_cost': jump_host_cost,
            'service_costs': service_costs,
            'additional_services_cost': additional_services_cost,
            'k8s_management_cost': k8s_management_cost,
            'platform_total': platform_cost
        },
        'gpu_costs': {
            'gpu_count': gpu_requirements['actual_gpus_allocated'],
            'gpu_cost_per_hour': gpu_cost_per_hour,
            'total_gpu_cost': gpu_cost,
            'gpu_available': gpu_available
        },
        'totals': {
            'platform_cost': platform_cost,
            'gpu_cost': gpu_cost,
            'total_cost': total_cost,
            'cost_per_hour': total_cost / hours if total_cost is not None else None,
            'cost_per_day': total_cost / days if total_cost is not None else None,
            'gpu_available': gpu_available
        }
    }


def create_comprehensive_dashboard():
    """Render the full calculator UI: sidebar configuration + cost dashboard."""
    # FIX: the duplicate st.set_page_config(...) call that was here has been
    # removed — Streamlit only allows one call per session, as the first
    # command (see module level above).
    st.title("🚀 Katonic Multitenant Infrastructure Calculator")
    st.markdown("**Comprehensive infrastructure planning for multi-tenant LLMOPS platforms with GPU-accelerated LLM inference**")

    # Sidebar Configuration
    with st.sidebar:
        st.header("🔧 Configuration")

        # Platform Configuration
        st.subheader("Platform Settings")
        num_tenants = st.slider(
            "Number of Tenants", min_value=1, max_value=20, value=3,
            help="Each tenant requires dedicated platform, compute, deploy, and VectorDB nodes"
        )
        apps_per_tenant = st.number_input(
            "Apps per Tenant", min_value=1, max_value=50, value=4, step=1,
            help="Number of applications per tenant. Every 4 apps require 1 deployment node"
        )

        # Cloud Provider Pricing Configuration
        st.subheader("Cloud Provider Pricing (Optional)")

        # AWS Pricing
        with st.expander("☁️ Customize AWS Costs", expanded=False):
            st.markdown("**Adjust AWS pricing (per hour in USD)**")
            st.markdown("##### Compute Nodes")
            aws_standard_node = st.number_input(
                "m5.2xlarge (8 vCPU, 32GB)", min_value=0.01, max_value=2.00, value=0.384,
                step=0.01, format="%.3f", key="aws_standard", help="Default: $0.384/hr"
            )
            aws_vectordb_node = st.number_input(
                "m5.4xlarge (16 vCPU, 64GB)", min_value=0.01, max_value=4.00, value=0.768,
                step=0.01, format="%.3f", key="aws_vectordb", help="Default: $0.768/hr"
            )
            aws_jump_host = st.number_input(
                "m5.large (2 vCPU, 8GB)", min_value=0.01, max_value=0.50, value=0.096,
                step=0.01, format="%.3f", key="aws_jump", help="Default: $0.096/hr"
            )
            aws_k8s_management = st.number_input(
                "EKS Management Cost", min_value=0.0, max_value=0.50, value=0.10,
                step=0.01, format="%.3f", key="aws_k8s", help="Default: $0.10/hr"
            )
            st.markdown("##### GPU Pricing")
            col1, col2 = st.columns(2)
            with col1:
                aws_h200 = st.number_input("H200 141GB", value=15.70, step=0.10, format="%.2f", key="aws_h200")
                aws_h100 = st.number_input("H100 80GB", value=6.01, step=0.10, format="%.2f", key="aws_h100")
                aws_a100_80 = st.number_input("A100 80GB", value=3.43, step=0.10, format="%.2f", key="aws_a100_80")
            with col2:
                aws_a100_40 = st.number_input("A100 40GB", value=2.75, step=0.10, format="%.2f", key="aws_a100_40")
                aws_l40s = st.number_input("L40S", value=1.67, step=0.10, format="%.2f", key="aws_l40s")

        # Azure Pricing
        with st.expander("☁️ Customize Azure Costs", expanded=False):
            st.markdown("**Adjust Azure pricing (per hour in USD)**")
            st.markdown("##### Compute Nodes")
            azure_standard_node = st.number_input(
                "Standard_D8s_v3 (8 vCPU, 32GB)", min_value=0.01, max_value=2.00, value=0.384,
                step=0.01, format="%.3f", key="azure_standard", help="Default: $0.384/hr"
            )
            azure_vectordb_node = st.number_input(
                "Standard_D16s_v3 (16 vCPU, 64GB)", min_value=0.01, max_value=4.00, value=0.768,
                step=0.01, format="%.3f", key="azure_vectordb", help="Default: $0.768/hr"
            )
            azure_jump_host = st.number_input(
                "Standard_D2s_v3 (2 vCPU, 8GB)", min_value=0.01, max_value=0.50, value=0.096,
                step=0.01, format="%.3f", key="azure_jump", help="Default: $0.096/hr"
            )
            azure_k8s_management = st.number_input(
                "AKS Management Cost", min_value=0.0, max_value=0.50, value=0.0,
                step=0.01, format="%.3f", key="azure_k8s", help="Default: $0.00/hr (Free tier)"
            )
            st.markdown("##### GPU Pricing")
            col1, col2 = st.columns(2)
            with col1:
                azure_h200 = st.number_input("H200 141GB", value=12.29, step=0.10, format="%.2f", key="azure_h200")
                azure_h100 = st.number_input("H100 80GB", value=6.98, step=0.10, format="%.2f", key="azure_h100")
                azure_a100_80 = st.number_input("A100 80GB", value=3.67, step=0.10, format="%.2f", key="azure_a100_80")
            with col2:
                azure_a100_40 = st.number_input("A100 40GB", value=3.67, step=0.10, format="%.2f", key="azure_a100_40")

        # GCP Pricing
        with st.expander("☁️ Customize GCP Costs", expanded=False):
            st.markdown("**Adjust GCP pricing (per hour in USD)**")
            st.markdown("##### Compute Nodes")
            gcp_standard_node = st.number_input(
                "n1-standard-8 (8 vCPU, 30GB)", min_value=0.01, max_value=2.00, value=0.379,
                step=0.01, format="%.3f", key="gcp_standard", help="Default: $0.379/hr"
            )
            gcp_vectordb_node = st.number_input(
                "n1-standard-16 (16 vCPU, 60GB)", min_value=0.01, max_value=4.00, value=0.758,
                step=0.01, format="%.3f", key="gcp_vectordb", help="Default: $0.758/hr"
            )
            gcp_jump_host = st.number_input(
                "e2-medium (2 vCPU, 8GB)", min_value=0.01, max_value=0.50, value=0.067,
                step=0.01, format="%.3f", key="gcp_jump", help="Default: $0.067/hr"
            )
            gcp_k8s_management = st.number_input(
                "GKE Management Cost", min_value=0.0, max_value=0.50, value=0.10,
                step=0.01, format="%.3f", key="gcp_k8s", help="Default: $0.10/hr"
            )
            st.markdown("##### GPU Pricing")
            col1, col2 = st.columns(2)
            with col1:
                gcp_h100 = st.number_input("H100 80GB", value=11.06, step=0.10, format="%.2f", key="gcp_h100")
                gcp_a100_80 = st.number_input("A100 80GB", value=2.48, step=0.10, format="%.2f", key="gcp_a100_80")
            with col2:
                gcp_a100_40 = st.number_input("A100 40GB", value=1.46, step=0.10, format="%.2f", key="gcp_a100_40")

        # On-Premise Pricing
        with st.expander("🏢 Customize On-Premise Costs", expanded=False):
            st.markdown("**Adjust on-premise costs based on your infrastructure**")
            st.markdown("##### Compute Nodes (per hour)")
            onprem_standard_node = st.number_input(
                "Standard Node (8 vCPU, 32GB)", min_value=0.01, max_value=1.00, value=0.192,
                step=0.01, format="%.3f", key="onprem_standard",
                help="Cost per hour for standard compute nodes (default: $0.192)"
            )
            onprem_vectordb_node = st.number_input(
                "VectorDB Node (16 vCPU, 64GB)", min_value=0.01, max_value=2.00, value=0.384,
                step=0.01, format="%.3f", key="onprem_vectordb",
                help="Cost per hour for high-memory VectorDB nodes (default: $0.384)"
            )
            onprem_jump_host = st.number_input(
                "Jump Host (2 vCPU, 8GB)", min_value=0.01, max_value=0.50, value=0.048,
                step=0.01, format="%.3f", key="onprem_jump",
                help="Cost per hour for jump host (default: $0.048)"
            )
            st.markdown("##### GPU Pricing Multiplier")
            onprem_gpu_multiplier = st.slider(
                "GPU Cost Multiplier (% of AWS)", min_value=30, max_value=100, value=55,
                step=5, key="onprem_gpu_mult",
                help="Percentage of AWS GPU pricing for on-premise (default: 55%)"
            ) / 100
            st.markdown("##### Additional Services (per hour)")
            onprem_network = st.number_input(
                "Network Infrastructure", min_value=0.0, max_value=0.10, value=0.020,
                step=0.005, format="%.3f", key="onprem_network",
                help="Switches, routers, firewalls (default: $0.020)"
            )
            onprem_storage_per_gb = st.number_input(
                "Storage (per GB per month)", min_value=0.01, max_value=0.20, value=0.05,
                step=0.01, format="%.3f", key="onprem_storage",
                help="SAN/NAS storage cost (default: $0.05/GB/month)"
            )
            onprem_load_balancer = st.number_input(
                "Hardware Load Balancer", min_value=0.0, max_value=0.05, value=0.010,
                step=0.005, format="%.3f", key="onprem_lb",
                help="Load balancer amortized cost (default: $0.010)"
            )
            onprem_power_cooling = st.number_input(
                "Power & Cooling", min_value=0.01, max_value=0.10, value=0.030,
                step=0.005, format="%.3f", key="onprem_power",
                help="Datacenter power and cooling (default: $0.030)"
            )
            onprem_datacenter_space = st.number_input(
                "Datacenter Space", min_value=0.0, max_value=0.05, value=0.015,
                step=0.005, format="%.3f", key="onprem_space",
                help="Rack space and facilities (default: $0.015)"
            )
            onprem_maintenance = st.number_input(
                "Maintenance & Support", min_value=0.0, max_value=0.10, value=0.025,
                step=0.005, format="%.3f", key="onprem_maint",
                help="Hardware maintenance contracts (default: $0.025)"
            )
            onprem_k8s_management = st.number_input(
                "K8s Management Cost", min_value=0.0, max_value=0.20, value=0.05,
                step=0.01, format="%.3f", key="onprem_k8s",
                help="Self-managed K8s operational cost (default: $0.05)"
            )

        # Reset button.
        # FIX: st.rerun() alone does NOT reset keyed widgets — their values
        # persist in st.session_state. Clear the pricing widget keys first so
        # the widgets fall back to their declared defaults on the rerun.
        if st.button("🔄 Reset All Pricing to Defaults", type="secondary"):
            for key in list(st.session_state.keys()):
                if key.startswith(("aws_", "azure_", "gcp_", "onprem_")):
                    del st.session_state[key]
            st.rerun()

        # LLM Configuration
        st.subheader("LLM Settings")
        selected_model = st.selectbox(
            "Select LLM Model", list(MODELS.keys()),
            index=2,  # Default to Llama 3.3 70B
            help="Choose the LLM model for inference workloads"
        )
        selected_gpu = st.selectbox(
            "Select GPU Type", list(GPUS.keys()),
            index=1,  # Default to H100 80GB
            help="GPU type for LLM inference nodes"
        )
        precision = st.selectbox(
            "Model Precision", ["FP16", "INT8", "INT4"],
            index=0,  # Default to FP16
            help="Model precision affects memory usage and quality"
        )

        # Workload Configuration
        st.subheader("Workload Settings")
        conversations_per_minute = st.number_input(
            "Conversations per Minute", min_value=1, max_value=5000, value=200, step=10,
            help="Expected conversation throughput across all tenants"
        )
        tokens_per_conversation = st.number_input(
            "Tokens per Conversation", min_value=500, max_value=20000, value=2000, step=100,
            help="Average tokens per conversation (input + output)"
        )

        # Time period
        time_period = st.selectbox(
            "Cost Calculation Period",
            ["Monthly (30 days)", "Weekly (7 days)", "Daily (1 day)", "Hourly"],
            index=0
        )
        days_map = {
            "Monthly (30 days)": 30,
            "Weekly (7 days)": 7,
            "Daily (1 day)": 1,
            "Hourly": 1 / 24
        }
        days = days_map[time_period]

    # Calculate all requirements
    infrastructure = calculate_detailed_infrastructure(num_tenants, apps_per_tenant)

    # Apply custom pricing.
    # NOTE(review): despite the original comment about "modified copies", this
    # mutates the module-level CLOUD_PRICING and GPUS dicts in place. That is
    # acceptable for a single-session Streamlit app but is shared state across
    # concurrent sessions — confirm before deploying multi-user.
    def apply_custom_pricing():
        """Apply user-configured pricing to global dictionaries."""
        # Update AWS pricing with user-configured values
        CLOUD_PRICING['AWS']['cost_per_node_hour'] = aws_standard_node
        CLOUD_PRICING['AWS']['vectordb_node']['cost_per_hour'] = aws_vectordb_node
        CLOUD_PRICING['AWS']['jump_host']['cost_per_hour'] = aws_jump_host
        CLOUD_PRICING['AWS']['managed_k8s_cost'] = aws_k8s_management
        # Update AWS GPU pricing
        GPUS["H200 141GB"]["pricing"]["aws"] = aws_h200
        GPUS["H100 80GB"]["pricing"]["aws"] = aws_h100
        GPUS["A100 80GB"]["pricing"]["aws"] = aws_a100_80
        GPUS["A100 40GB"]["pricing"]["aws"] = aws_a100_40
        GPUS["L40S"]["pricing"]["aws"] = aws_l40s

        # Update Azure pricing with user-configured values
        CLOUD_PRICING['Azure']['cost_per_node_hour'] = azure_standard_node
        CLOUD_PRICING['Azure']['vectordb_node']['cost_per_hour'] = azure_vectordb_node
        CLOUD_PRICING['Azure']['jump_host']['cost_per_hour'] = azure_jump_host
        CLOUD_PRICING['Azure']['managed_k8s_cost'] = azure_k8s_management
        # Update Azure GPU pricing
        GPUS["H200 141GB"]["pricing"]["azure"] = azure_h200
        GPUS["H100 80GB"]["pricing"]["azure"] = azure_h100
        GPUS["A100 80GB"]["pricing"]["azure"] = azure_a100_80
        GPUS["A100 40GB"]["pricing"]["azure"] = azure_a100_40

        # Update GCP pricing with user-configured values
        CLOUD_PRICING['GCP']['cost_per_node_hour'] = gcp_standard_node
        CLOUD_PRICING['GCP']['vectordb_node']['cost_per_hour'] = gcp_vectordb_node
        CLOUD_PRICING['GCP']['jump_host']['cost_per_hour'] = gcp_jump_host
        CLOUD_PRICING['GCP']['managed_k8s_cost'] = gcp_k8s_management
        # Update GCP GPU pricing
        GPUS["H100 80GB"]["pricing"]["gcp"] = gcp_h100
        GPUS["A100 80GB"]["pricing"]["gcp"] = gcp_a100_80
        GPUS["A100 40GB"]["pricing"]["gcp"] = gcp_a100_40

        # Update On-Premise pricing with user-configured values
        CLOUD_PRICING['On-Premise']['cost_per_node_hour'] = onprem_standard_node
        CLOUD_PRICING['On-Premise']['vectordb_node']['cost_per_hour'] = onprem_vectordb_node
        CLOUD_PRICING['On-Premise']['jump_host']['cost_per_hour'] = onprem_jump_host
        CLOUD_PRICING['On-Premise']['managed_k8s_cost'] = onprem_k8s_management
        # Update on-premise additional services
        CLOUD_PRICING['On-Premise']['additional_services'] = {
            'Network_Infrastructure': {'cost_per_hour': onprem_network, 'description': 'Switches, routers, firewalls (amortized)'},
            'Storage_SAN': {'cost_per_gb_month': onprem_storage_per_gb, 'description': 'SAN/NAS storage (1TB base, amortized)'},
            'Hardware_Load_Balancer': {'cost_per_hour': onprem_load_balancer, 'description': 'F5/Citrix ADC (amortized)'},
            'Power_Cooling': {'cost_per_hour': onprem_power_cooling, 'description': 'Datacenter power and cooling'},
            'Datacenter_Space': {'cost_per_hour': onprem_datacenter_space, 'description': 'Rack space and facilities costs'},
            'Maintenance_Support': {'cost_per_hour': onprem_maintenance, 'description': 'Hardware maintenance and vendor support contracts'}
        }
        # Update on-premise GPU pricing
based on AWS prices and multiplier for gpu_name in GPUS.keys(): if 'aws' in GPUS[gpu_name]['pricing'] and GPUS[gpu_name]['pricing']['aws'] != 'NA': aws_price = GPUS[gpu_name]['pricing']['aws'] GPUS[gpu_name]['pricing']['on-premise'] = round(aws_price * onprem_gpu_multiplier, 2) # Apply all custom pricing apply_custom_pricing() precision_bytes = { "FP16": 2, "INT8": 1, "INT4": 0.5 }[precision] model_spec = MODELS[selected_model] gpu_spec = GPUS[selected_gpu] gpu_requirements = calculate_gpu_requirements( conversations_per_minute, tokens_per_conversation, model_spec, gpu_spec, precision_bytes ) # Main Dashboard st.header("📊 Infrastructure Overview") st.markdown("---") # Visual separator # Row 1: Core Metrics - Use 4 columns for better spacing col1, col2, col3, col4 = st.columns(4) with col1: st.metric( label="🏢 Total Tenants", value=f"{num_tenants}", help="Number of tenant environments" ) with col2: st.metric( label="📦 Apps per Tenant", value=f"{apps_per_tenant}", help=f"Total applications: {infrastructure['totals']['total_apps']}" ) with col3: st.metric( label="🖥️ Worker Nodes", value=f"{infrastructure['totals']['total_nodes']}", help=f"Standard: {infrastructure['totals']['total_standard_nodes']}, VectorDB: {infrastructure['totals']['total_vectordb_nodes']}" ) with col4: gpu_display = f"{gpu_requirements['actual_gpus_allocated']} GPUs" if gpu_requirements['best_config']: gpu_detail = f"({gpu_requirements['best_config']['num_nodes']} nodes)" else: gpu_detail = "" st.metric( label="🎮 GPU Resources", value=gpu_display, delta=gpu_detail, help=f"Configuration: {gpu_requirements['best_config']['num_nodes']}×{gpu_requirements['best_config']['gpus_per_node']} GPUs" if gpu_requirements['best_config'] else "GPU allocation" ) # Row 2: Performance Metrics col1, col2, col3, col4 = st.columns(4) with col1: st.metric( label="💬 Target Load", value=f"{conversations_per_minute}", delta="conv/min", help="Target conversation throughput" ) with col2: st.metric( label="📈 Max Capacity", 
value=f"{gpu_requirements['max_conversations_per_minute']:.0f}", delta="conv/min", help="Maximum system capacity" ) with col3: capacity_headroom = gpu_requirements['max_conversations_per_minute'] - conversations_per_minute headroom_percentage = (capacity_headroom / gpu_requirements['max_conversations_per_minute']) * 100 st.metric( label="📊 Capacity Headroom", value=f"{headroom_percentage:.1f}%", delta=f"{capacity_headroom:.0f} conv/min available", help="Available capacity beyond current target load" ) with col4: bottleneck_icon = "💾" if gpu_requirements['bottleneck'] == 'Memory' else "⚡" st.metric( label=f"{bottleneck_icon} Bottleneck", value=gpu_requirements['bottleneck'], help="Primary system constraint" ) st.markdown("---") # Visual separator # Create tabs for detailed views tab1, tab2, tab3, tab4, tab5 = st.tabs([ "🗏 Platform Infrastructure", "🖥️ GPU Requirements", "💰 Cost Analysis", "📈 Performance Analysis", "🔧 Technical Specifications" ]) with tab1: st.subheader("Platform Infrastructure Breakdown") # Show deployment node scaling info st.info(f"📦 **Deployment Node Scaling**: {infrastructure['totals']['deploy_nodes_per_tenant']} deployment node(s) per tenant for {apps_per_tenant} apps (1 node per 4 apps)") # Platform nodes breakdown breakdown_data = [] for node_type, details in infrastructure['node_breakdown'].items(): per_tenant_value = details['tenant'] // num_tenants if num_tenants > 0 and details['tenant'] > 0 else 0 breakdown_data.append({ 'Node Type': node_type, 'Base': details['base'] if details['base'] > 0 else '-', 'Per Tenant': per_tenant_value if per_tenant_value > 0 else '-', 'Total': details['total'], 'CPU': details['cores'], 'RAM (GB)': details['ram'], 'VM Type': details.get('node_type', 'Standard'), 'Purpose': details['purpose'] }) breakdown_df = pd.DataFrame(breakdown_data) # Use column configuration for better display st.dataframe( breakdown_df, use_container_width=True, hide_index=True, column_config={ "Node Type": 
st.column_config.TextColumn("Node Type", width="medium"), "Base": st.column_config.TextColumn("Base", width="small"), "Per Tenant": st.column_config.TextColumn("Per Tenant", width="small"), "Total": st.column_config.NumberColumn("Total", width="small"), "CPU": st.column_config.NumberColumn("CPU", width="small"), "RAM (GB)": st.column_config.NumberColumn("RAM (GB)", width="small"), "VM Type": st.column_config.TextColumn("VM Type", width="medium"), "Purpose": st.column_config.TextColumn("Purpose", width="large") } ) # Visual breakdown col1, col2 = st.columns(2) with col1: # Node distribution pie chart node_counts = {node_type: details['total'] for node_type, details in infrastructure['node_breakdown'].items() if details['total'] > 0} fig_nodes = px.pie( values=list(node_counts.values()), names=list(node_counts.keys()), title="Platform Node Distribution" ) st.plotly_chart(fig_nodes, use_container_width=True) with col2: # Resource distribution resource_data = [] for node_type, details in infrastructure['node_breakdown'].items(): if details['total'] > 0: resource_data.extend([ {'Node Type': node_type, 'Resource': 'CPU Cores', 'Amount': details['cores']}, {'Node Type': node_type, 'Resource': 'RAM (GB)', 'Amount': details['ram']} ]) resource_df = pd.DataFrame(resource_data) fig_resources = px.bar( resource_df, x='Node Type', y='Amount', color='Resource', title='Resource Distribution by Node Type', barmode='group' ) st.plotly_chart(fig_resources, use_container_width=True) # Node type distribution st.subheader("Node Type Distribution") col1, col2 = st.columns(2) with col1: st.metric( "Standard Nodes (8 vCPU, 32GB RAM)", infrastructure['totals']['total_standard_nodes'], help="Platform, Compute, and Deploy nodes" ) with col2: st.metric( "High-Memory Nodes (16 vCPU, 64GB RAM)", infrastructure['totals']['total_vectordb_nodes'], help="VectorDB nodes with higher memory capacity" ) with tab2: st.subheader("GPU Requirements Analysis") # GPU requirements metrics col1, col2, col3, 
col4 = st.columns(4) with col1: st.metric( "Memory-based GPUs", gpu_requirements['gpus_needed_memory'], help="GPUs needed to fit model in memory" ) with col2: st.metric( "Throughput-based GPUs", gpu_requirements['gpus_needed_throughput'], help="GPUs needed for required throughput" ) with col3: st.metric( "Logical GPUs Needed", gpu_requirements['total_gpus_needed'], help="Minimum GPUs needed (before node configuration)" ) with col4: st.metric( "Actual GPUs Allocated", gpu_requirements['actual_gpus_allocated'], help="GPUs allocated based on standard node configurations", delta=gpu_requirements['actual_gpus_allocated'] - gpu_requirements['total_gpus_needed'] ) # GPU Node Configuration Analysis st.subheader("🖥️ GPU Node Configuration Options") if gpu_requirements['gpu_configurations']: # Display configuration options in a table config_data = [] for config in gpu_requirements['gpu_configurations']: efficiency_score = f"{config['utilization']:.1f}%" memory_compatible = "✅" if config['meets_memory_req'] else "❌" config_data.append({ "GPUs/Node": config['gpus_per_node'], "Mem": memory_compatible, "Nodes": config['num_nodes'], "Total GPUs": config['total_gpus_allocated'], "GPU Util": efficiency_score, "Waste": config['gpu_waste'], "Mem Util": f"{config['memory_utilization']:.1f}%" }) config_df = pd.DataFrame(config_data) st.dataframe( config_df, use_container_width=True, hide_index=True, column_config={ "GPUs/Node": st.column_config.NumberColumn("GPUs/Node", width="small"), "Mem": st.column_config.TextColumn("Mem ✓", width="small"), "Nodes": st.column_config.NumberColumn("Nodes", width="small"), "Total GPUs": st.column_config.NumberColumn("Total GPUs", width="small"), "GPU Util": st.column_config.TextColumn("GPU Util", width="small"), "Waste": st.column_config.NumberColumn("Waste", width="small"), "Mem Util": st.column_config.TextColumn("Mem Util", width="small") } ) # Highlight the recommended configuration if gpu_requirements['best_config']: best = 
gpu_requirements['best_config'] st.success(f"💡 **Recommended Configuration**: {best['num_nodes']} nodes × {best['gpus_per_node']} GPUs = {best['total_gpus_allocated']} total GPUs ({best['utilization']:.1f}% utilization)") # Show minimum requirement info st.info(f"**Memory Constraint**: Minimum {gpu_requirements['min_gpus_per_node']} GPUs per node required to fit {gpu_requirements['model_memory_gb']:.1f}GB model in {gpu_spec['memory']}GB GPU memory") # GPU configuration visualization col1, col2 = st.columns(2) with col1: # Node configuration comparison if gpu_requirements['gpu_configurations']: config_chart_data = pd.DataFrame(gpu_requirements['gpu_configurations']) fig_configs = px.bar( config_chart_data, x='gpus_per_node', y='utilization', title='GPU Utilization by Node Configuration', labels={'gpus_per_node': 'GPUs per Node', 'utilization': 'Utilization (%)'} ) st.plotly_chart(fig_configs, use_container_width=True) with col2: # GPU allocation vs requirement allocation_data = pd.DataFrame({ 'Metric': ['Required GPUs', 'Allocated GPUs'], 'Count': [gpu_requirements['total_gpus_needed'], gpu_requirements['actual_gpus_allocated']] }) fig_allocation = px.bar( allocation_data, x='Metric', y='Count', title='GPU Allocation vs Requirement', color='Metric' ) st.plotly_chart(fig_allocation, use_container_width=True) # Model and GPU specifications st.subheader("🔧 Model & GPU Specifications") # GPU configuration table gpu_config_data = [{ 'Model': selected_model, 'Parameters': f"{model_spec['params']}B ({model_spec['active_params']}B active)" if model_spec['params'] != model_spec['active_params'] else f"{model_spec['params']}B", 'Model Memory Required': f"{gpu_requirements['model_memory_gb']:.1f} GB", 'GPU Type': selected_gpu, 'GPU Memory per Unit': f"{gpu_spec['memory']} GB", 'GPUs Required (Logic)': gpu_requirements['total_gpus_needed'], 'GPUs Allocated (Actual)': gpu_requirements['actual_gpus_allocated'], 'GPU Nodes': f"{gpu_requirements['best_config']['num_nodes']} nodes × 
{gpu_requirements['best_config']['gpus_per_node']} GPUs" if gpu_requirements['best_config'] else 'N/A', 'Total GPU Memory': f"{gpu_requirements['actual_gpus_allocated'] * gpu_spec['memory']} GB", 'Memory Utilization': f"{(gpu_requirements['model_memory_gb'] / (gpu_requirements['actual_gpus_allocated'] * gpu_spec['memory']) * 100):.1f}%", 'Precision': precision }] gpu_config_df = pd.DataFrame(gpu_config_data) st.dataframe(gpu_config_df, use_container_width=True) # Performance metrics col1, col2 = st.columns(2) with col1: # TPS comparison tps_data = pd.DataFrame({ 'Metric': ['Required TPS', 'Single GPU TPS', 'Total System TPS'], 'Value': [ gpu_requirements['required_tps'], gpu_requirements['estimated_tps'], gpu_requirements['total_system_tps'] ] }) fig_tps = px.bar( tps_data, x='Metric', y='Value', title='Tokens Per Second Analysis', color='Metric' ) st.plotly_chart(fig_tps, use_container_width=True) with col2: # Capacity utilization utilization_data = pd.DataFrame({ 'Metric': ['Required Capacity', 'Available Capacity'], 'Conversations/Min': [ conversations_per_minute, gpu_requirements['max_conversations_per_minute'] ] }) fig_capacity = px.bar( utilization_data, x='Metric', y='Conversations/Min', title='Conversation Capacity Analysis', color='Metric' ) st.plotly_chart(fig_capacity, use_container_width=True) with tab3: st.subheader("Comprehensive Cost Analysis") # Show customization status for all providers default_values = { 'aws': {'standard': 0.384, 'vectordb': 0.768, 'jump': 0.096, 'k8s': 0.10}, 'azure': {'standard': 0.384, 'vectordb': 0.768, 'jump': 0.096, 'k8s': 0.0}, 'gcp': {'standard': 0.379, 'vectordb': 0.758, 'jump': 0.067, 'k8s': 0.10}, 'onprem': {'standard': 0.192, 'vectordb': 0.384, 'jump': 0.048, 'gpu_mult': 0.55, 'k8s': 0.05} } customizations = [] # Check AWS customizations if (aws_standard_node != default_values['aws']['standard'] or aws_vectordb_node != default_values['aws']['vectordb'] or aws_k8s_management != default_values['aws']['k8s']): 
customizations.append("AWS") # Check Azure customizations if (azure_standard_node != default_values['azure']['standard'] or azure_vectordb_node != default_values['azure']['vectordb'] or azure_k8s_management != default_values['azure']['k8s']): customizations.append("Azure") # Check GCP customizations if (gcp_standard_node != default_values['gcp']['standard'] or gcp_vectordb_node != default_values['gcp']['vectordb'] or gcp_k8s_management != default_values['gcp']['k8s']): customizations.append("GCP") # Check On-Premise customizations if (onprem_standard_node != default_values['onprem']['standard'] or onprem_vectordb_node != default_values['onprem']['vectordb'] or onprem_gpu_multiplier != default_values['onprem']['gpu_mult'] or onprem_k8s_management != default_values['onprem']['k8s']): customizations.append("On-Premise") if customizations: st.warning(f""" **✏️ Custom Pricing Active for: {', '.join(customizations)}** Using user-configured pricing instead of defaults. View details in Technical Specifications tab or adjust in sidebar. 
""") # Add info box about cost models st.info(""" **💡 Cost Model Information**: - **Cloud Providers (AWS/Azure/GCP)**: Pay-as-you-go pricing with per-hour compute and GPU costs - **On-Premise**: Hardware amortized over 3-year lifecycle + operating costs (power, cooling, maintenance) - **Customization**: All pricing values can be adjusted in the sidebar to match your actual costs **🔧 Customize:** Use the sidebar "Cloud Provider Pricing" sections to adjust costs """) # Calculate costs for all providers all_costs = {} for provider in CLOUD_PRICING.keys(): all_costs[provider] = calculate_detailed_costs( provider, infrastructure, gpu_requirements, gpu_spec, days ) # Cost comparison table cost_comparison_data = [] for provider, costs in all_costs.items(): gpu_available = costs['totals']['gpu_available'] cost_comparison_data.append({ 'Provider': provider, 'GPU': '✅' if gpu_available else '❌', 'Platform': f"${costs['totals']['platform_cost']:.2f}", 'GPU Cost': format_cost_for_display(costs['totals']['gpu_cost'], gpu_available), 'Total': format_cost_for_display(costs['totals']['total_cost'], gpu_available), 'Per Hour': format_cost_for_display(costs['totals']['cost_per_hour'], gpu_available), 'Per Day': format_cost_for_display(costs['totals']['cost_per_day'], gpu_available), 'Total_Numeric': costs['totals']['total_cost'] if gpu_available else None, 'GPU_Available': gpu_available }) cost_df = pd.DataFrame(cost_comparison_data) display_cost_df = cost_df.drop(['Total_Numeric', 'GPU_Available'], axis=1) st.dataframe( display_cost_df, use_container_width=True, hide_index=True, column_config={ "Provider": st.column_config.TextColumn("Provider", width="medium"), "GPU": st.column_config.TextColumn("GPU ✓", width="small"), "Platform": st.column_config.TextColumn("Platform Cost", width="medium"), "GPU Cost": st.column_config.TextColumn("GPU Cost", width="medium"), "Total": st.column_config.TextColumn("Total Cost", width="medium"), "Per Hour": st.column_config.TextColumn("$/Hour", 
width="medium"), "Per Day": st.column_config.TextColumn("$/Day", width="medium") } ) # Add download button for cost report report_data = create_downloadable_cost_report( all_costs, infrastructure, gpu_requirements, model_spec, gpu_spec, selected_model, selected_gpu, num_tenants, apps_per_tenant, conversations_per_minute, tokens_per_conversation, precision, time_period ) st.download_button( label="📥 Download Complete Cost Report (JSON)", data=json.dumps(report_data, indent=2), file_name=f"llmops_cost_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", mime="application/json", help="Download comprehensive cost analysis with all services and configurations" ) # Create CSV version for easier viewing csv_data = [] for provider, provider_data in report_data['cost_breakdown_by_provider'].items(): if provider_data['gpu_available']: csv_data.append({ 'Provider': provider, 'GPU_Available': 'Yes', 'Platform_Nodes': infrastructure['totals']['total_standard_nodes'], 'VectorDB_Nodes': infrastructure['totals']['total_vectordb_nodes'], 'GPU_Nodes': gpu_requirements['total_gpus_needed'], 'Kubernetes_Nodes_Cost': provider_data['platform_costs']['kubernetes_nodes'], 'VectorDB_Nodes_Cost': provider_data['platform_costs']['vectordb_nodes'], 'Jump_Host_Cost': provider_data['platform_costs']['jump_host'], 'Additional_Services_Cost': provider_data['platform_costs']['additional_services'], 'K8s_Management_Cost': provider_data['platform_costs']['k8s_management'], 'Total_Platform_Cost': provider_data['platform_costs']['platform_total'], 'GPU_Cost_Per_Hour': provider_data['gpu_costs']['gpu_cost_per_hour'], 'Total_GPU_Cost': provider_data['gpu_costs']['total_gpu_cost'], 'Total_Infrastructure_Cost': provider_data['totals']['total_cost'], 'Cost_Per_Hour': provider_data['totals']['cost_per_hour'], 'Cost_Per_Day': provider_data['totals']['cost_per_day'] }) else: csv_data.append({ 'Provider': provider, 'GPU_Available': 'No', 'Platform_Nodes': infrastructure['totals']['total_standard_nodes'], 
'VectorDB_Nodes': infrastructure['totals']['total_vectordb_nodes'], 'GPU_Nodes': 'N/A', 'Kubernetes_Nodes_Cost': provider_data['platform_costs']['kubernetes_nodes'], 'VectorDB_Nodes_Cost': provider_data['platform_costs']['vectordb_nodes'], 'Jump_Host_Cost': provider_data['platform_costs']['jump_host'], 'Additional_Services_Cost': provider_data['platform_costs']['additional_services'], 'K8s_Management_Cost': provider_data['platform_costs']['k8s_management'], 'Total_Platform_Cost': provider_data['platform_costs']['platform_total'], 'GPU_Cost_Per_Hour': 'N/A', 'Total_GPU_Cost': 'N/A', 'Total_Infrastructure_Cost': 'N/A', 'Cost_Per_Hour': 'N/A', 'Cost_Per_Day': 'N/A' }) csv_df = pd.DataFrame(csv_data) csv_string = csv_df.to_csv(index=False) st.download_button( label="📊 Download Cost Summary (CSV)", data=csv_string, file_name=f"llmops_cost_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", mime="text/csv", help="Download cost summary in CSV format for spreadsheet analysis" ) # Cost breakdown visualization - only for providers with GPU available available_providers_data = cost_df[cost_df['GPU_Available'] == True] col1, col2 = st.columns(2) with col1: # Provider comparison - only available providers if not available_providers_data.empty: fig_provider_comparison = px.bar( available_providers_data, x='Provider', y='Total_Numeric', title=f'Total Cost Comparison ({time_period}) - All Deployment Options', labels={'Total_Numeric': 'Total Cost (USD)'}, color='Provider' ) st.plotly_chart(fig_provider_comparison, use_container_width=True) else: st.warning("⚠️ No providers have the selected GPU available for cost comparison") with col2: # Cost breakdown for selected provider (cheapest available) available_providers = get_available_providers_for_gpu(gpu_spec) if available_providers: cheapest_provider = min(available_providers, key=lambda x: all_costs[x]['totals']['total_cost']) cheapest_costs = all_costs[cheapest_provider] breakdown_values = [ 
cheapest_costs['totals']['platform_cost'], cheapest_costs['totals']['gpu_cost'] ] breakdown_labels = ['Platform Infrastructure', 'GPU Infrastructure'] fig_breakdown = px.pie( values=breakdown_values, names=breakdown_labels, title=f'{cheapest_provider} - Cost Breakdown' ) st.plotly_chart(fig_breakdown, use_container_width=True) else: st.warning("⚠️ No providers have the selected GPU available") # Detailed cost breakdown for cheapest available provider available_providers = get_available_providers_for_gpu(gpu_spec) if available_providers: cheapest_provider = min(available_providers, key=lambda x: all_costs[x]['totals']['total_cost']) st.subheader(f"💡 Most Cost-Effective Option: {cheapest_provider}") if cheapest_provider == 'On-Premise': st.success(f"✅ **On-Premise deployment offers the lowest cost** with {selected_gpu}") st.info("💰 **Note**: On-premise costs assume 3-year hardware amortization. Initial capex and datacenter setup costs are not included in hourly rates.") else: st.info(f"✅ **{selected_gpu} is available on {cheapest_provider}**") cheapest_costs = all_costs[cheapest_provider] col1, col2, col3 = st.columns(3) with col1: st.metric( "Platform Infrastructure", f"${cheapest_costs['totals']['platform_cost']:.2f}", help="Kubernetes nodes (including VectorDB), networking, storage, management" ) with col2: st.metric( "GPU Infrastructure", f"${cheapest_costs['totals']['gpu_cost']:.2f}", help=f"{gpu_requirements['total_gpus_needed']} x {selected_gpu}" ) with col3: # Calculate savings compared to most expensive available provider if len(available_providers) > 1: most_expensive_available = max(available_providers, key=lambda x: all_costs[x]['totals']['total_cost']) savings = all_costs[most_expensive_available]['totals']['total_cost'] - cheapest_costs['totals']['total_cost'] savings_pct = (savings / all_costs[most_expensive_available]['totals']['total_cost']) * 100 st.metric( "Potential Savings", f"${savings:.2f}", help=f"Savings compared to {most_expensive_available} 
({savings_pct:.1f}%)" ) else: st.metric( "Provider Status", "Only Option", help="This is the only provider with the selected GPU available" ) # Cloud vs On-Premise comparison if both are available if 'On-Premise' in available_providers and len(available_providers) > 1: st.subheader("☁️ Cloud vs 🏢 On-Premise Comparison") onprem_cost = all_costs['On-Premise']['totals']['total_cost'] cloud_providers = [p for p in available_providers if p != 'On-Premise'] comparison_data = [] for provider in ['On-Premise'] + cloud_providers: comparison_data.append({ 'Deployment Type': 'On-Premise' if provider == 'On-Premise' else 'Cloud', 'Provider': provider, 'Total Cost': all_costs[provider]['totals']['total_cost'], 'Platform Cost': all_costs[provider]['totals']['platform_cost'], 'GPU Cost': all_costs[provider]['totals']['gpu_cost'] }) comp_df = pd.DataFrame(comparison_data) # Create grouped bar chart fig_comparison = go.Figure() fig_comparison.add_trace(go.Bar( name='Platform Cost', x=comp_df['Provider'], y=comp_df['Platform Cost'], marker_color='lightblue' )) fig_comparison.add_trace(go.Bar( name='GPU Cost', x=comp_df['Provider'], y=comp_df['GPU Cost'], marker_color='orange' )) fig_comparison.update_layout( title='Cost Breakdown: On-Premise vs Cloud', xaxis_title='Provider', yaxis_title='Cost (USD)', barmode='stack' ) st.plotly_chart(fig_comparison, use_container_width=True) # Calculate average cloud cost avg_cloud_cost = sum([all_costs[p]['totals']['total_cost'] for p in cloud_providers]) / len(cloud_providers) cloud_savings = avg_cloud_cost - onprem_cost cloud_savings_pct = (cloud_savings / avg_cloud_cost) * 100 if cloud_savings > 0: st.success(f"💰 **On-Premise Savings**: ${cloud_savings:.2f} ({cloud_savings_pct:.1f}%) compared to average cloud cost over {time_period}") else: st.info(f"☁️ **Cloud is more cost-effective** for this configuration over {time_period}") else: st.error(f"❌ **No Providers Available**: The selected GPU ({selected_gpu}) is not available on any deployment 
option") st.warning("**Recommendation**: Please select a different GPU model that is available") # Show which GPUs are available on which providers st.subheader("🔍 GPU Availability by Provider") availability_data = [] for gpu_name, gpu_data in GPUS.items(): available_on = get_available_providers_for_gpu(gpu_data) availability_data.append({ 'GPU Model': gpu_name, 'Memory': f"{gpu_data['memory']} GB", 'Available On': ', '.join(available_on) if available_on else 'None', 'Deployment Options': len(available_on) }) availability_df = pd.DataFrame(availability_data) availability_df = availability_df.sort_values('Deployment Options', ascending=False) st.dataframe(availability_df, use_container_width=True) with tab4: st.subheader("Performance Analysis & Scaling") # Performance metrics col1, col2, col3 = st.columns(3) with col1: st.metric( "Total System TPS", f"{gpu_requirements['total_system_tps']:.0f}", help="Combined throughput of all GPUs" ) with col2: st.metric( "Conversation Capacity", f"{gpu_requirements['max_conversations_per_minute']:.0f}/min", help="Maximum conversations the system can handle" ) with col3: capacity_headroom = gpu_requirements['max_conversations_per_minute'] - conversations_per_minute headroom_percentage = (capacity_headroom / gpu_requirements['max_conversations_per_minute']) * 100 st.metric( "Capacity Headroom", f"{headroom_percentage:.1f}%", delta=f"{capacity_headroom:.0f} conv/min available", help="Available capacity beyond current target load" ) # Scaling analysis st.subheader("Scaling Analysis") # Create scaling scenarios scaling_scenarios = [0.5, 1.0, 1.5, 2.0, 3.0, 5.0] scaling_data = [] for multiplier in scaling_scenarios: scaled_conversations = int(conversations_per_minute * multiplier) scaled_gpu_reqs = calculate_gpu_requirements( scaled_conversations, tokens_per_conversation, model_spec, gpu_spec, precision_bytes ) scaling_data.append({ 'Load Multiplier': f"{multiplier}x", 'Conversations/Min': scaled_conversations, 'Logical GPUs': 
scaled_gpu_reqs['total_gpus_needed'], 'Allocated GPUs': scaled_gpu_reqs['actual_gpus_allocated'], 'GPU Nodes': f"{scaled_gpu_reqs['best_config']['num_nodes']}×{scaled_gpu_reqs['best_config']['gpus_per_node']}" if scaled_gpu_reqs['best_config'] else 'N/A', 'System Capacity': f"{scaled_gpu_reqs['max_conversations_per_minute']:.0f}", 'Headroom %': f"{((scaled_gpu_reqs['max_conversations_per_minute'] - scaled_conversations) / scaled_gpu_reqs['max_conversations_per_minute'] * 100):.1f}%" }) scaling_df = pd.DataFrame(scaling_data) st.dataframe( scaling_df, use_container_width=True, hide_index=True, column_config={ "Load Multiplier": st.column_config.TextColumn("Load", width="small"), "Conversations/Min": st.column_config.NumberColumn("Conv/Min", width="small"), "Logical GPUs": st.column_config.NumberColumn("Logical", width="small"), "Allocated GPUs": st.column_config.NumberColumn("Allocated", width="small"), "GPU Nodes": st.column_config.TextColumn("GPU Nodes", width="medium"), "System Capacity": st.column_config.TextColumn("Capacity", width="medium"), "Headroom %": st.column_config.TextColumn("Headroom %", width="small") } ) # Scaling visualization fig_scaling = go.Figure() # Add lines for both logical and allocated GPUs fig_scaling.add_trace(go.Scatter( x=[float(x.replace('x', '')) for x in scaling_df['Load Multiplier']], y=scaling_df['Logical GPUs'].astype(int), mode='lines+markers', name='Logical GPUs Required', line=dict(color='blue', dash='dash') )) fig_scaling.add_trace(go.Scatter( x=[float(x.replace('x', '')) for x in scaling_df['Load Multiplier']], y=scaling_df['Allocated GPUs'].astype(int), mode='lines+markers', name='Allocated GPUs (Actual)', line=dict(color='red') )) fig_scaling.update_layout( title='GPU Scaling Requirements (Logical vs Allocated)', xaxis_title='Load Multiplier', yaxis_title='Number of GPUs' ) st.plotly_chart(fig_scaling, use_container_width=True) # Application scaling analysis st.subheader("Application Scaling Analysis") 
app_scaling_scenarios = [4, 8, 12, 16, 20, 24, 32, 40] app_scaling_data = [] for apps in app_scaling_scenarios: deploy_nodes = math.ceil(apps / 4) app_scaling_data.append({ 'Apps per Tenant': apps, 'Total Apps': apps * num_tenants, 'Deploy Nodes per Tenant': deploy_nodes, 'Total Deploy Nodes': deploy_nodes * num_tenants, 'Deploy Node Ratio': f"1:{4 if apps >= 4 else apps}" }) app_scaling_df = pd.DataFrame(app_scaling_data) st.dataframe( app_scaling_df, use_container_width=True, hide_index=True, column_config={ "Apps per Tenant": st.column_config.NumberColumn("Apps/Tenant", width="small"), "Total Apps": st.column_config.NumberColumn("Total Apps", width="small"), "Deploy Nodes per Tenant": st.column_config.NumberColumn("Deploy/Tenant", width="small"), "Total Deploy Nodes": st.column_config.NumberColumn("Total Deploy", width="medium"), "Deploy Node Ratio": st.column_config.TextColumn("Ratio", width="small") } ) # App scaling visualization fig_app_scaling = px.line( app_scaling_df, x='Apps per Tenant', y='Total Deploy Nodes', title='Deployment Nodes Scaling with Application Count', markers=True ) st.plotly_chart(fig_app_scaling, use_container_width=True) with tab5: st.subheader("Technical Specifications") # Model specifications st.markdown("### 🤖 LLM Model Specifications") model_specs_data = [{ 'Property': 'Model Name', 'Value': selected_model }, { 'Property': 'Organization', 'Value': model_spec['org'] }, { 'Property': 'Total Parameters', 'Value': f"{model_spec['params']}B" }, { 'Property': 'Active Parameters', 'Value': f"{model_spec['active_params']}B" }, { 'Property': 'Max Context Length', 'Value': f"{model_spec['max_context']:,} tokens" }, { 'Property': 'Base TPS', 'Value': f"{model_spec['base_tps']:,}" }, { 'Property': 'License', 'Value': model_spec['license'] }, { 'Property': 'Architecture Type', 'Value': 'Mixture of Experts (MoE)' if model_spec['params'] != model_spec['active_params'] else 'Dense Model' }] model_specs_df = pd.DataFrame(model_specs_data) 
st.dataframe(model_specs_df, use_container_width=True)

# GPU specifications: hardware characteristics of the selected accelerator.
st.markdown("### 🖥️ GPU Specifications")
gpu_spec_rows = [
    ('GPU Model', selected_gpu),
    ('Memory Capacity', f"{gpu_spec['memory']} GB"),
    ('Compute Capability', gpu_spec['compute']),
    ('TPS Range', f"{gpu_spec['tps_min']:,} - {gpu_spec['tps_max']:,}"),
    ('Efficiency Tier', gpu_spec['efficiency_tier']),
    ('Model Precision', precision),
]
gpu_specs_data = [{'Property': prop, 'Value': value} for prop, value in gpu_spec_rows]
gpu_specs_df = pd.DataFrame(gpu_specs_data)
st.dataframe(gpu_specs_df, use_container_width=True)

# Platform specifications: aggregate node/CPU/RAM/GPU footprint.
st.markdown("### 🗏 Platform Infrastructure Specifications")
best_cfg = gpu_requirements['best_config']
# When a node layout exists, show allocated GPUs and the node × GPU split;
# otherwise fall back to the raw logical GPU count.
if best_cfg:
    gpu_node_spec = (
        f"{gpu_requirements['actual_gpus_allocated']} × {selected_gpu} "
        f"({best_cfg['num_nodes']} nodes × {best_cfg['gpus_per_node']} GPUs)"
    )
else:
    gpu_node_spec = f"{gpu_requirements['total_gpus_needed']} × {selected_gpu}"
platform_specs_data = [
    {'Component': 'Standard K8s Nodes',
     'Specification': f"{infrastructure['totals']['total_standard_nodes']} nodes × 8 vCPUs × 32GB RAM"},
    {'Component': 'VectorDB Nodes',
     'Specification': f"{infrastructure['totals']['total_vectordb_nodes']} nodes × 16 vCPUs × 64GB RAM"},
    {'Component': 'GPU Nodes',
     'Specification': gpu_node_spec},
    {'Component': 'Total CPU Cores',
     'Specification': f"{infrastructure['totals']['total_cpu']} cores"},
    {'Component': 'Total RAM',
     'Specification': f"{infrastructure['totals']['total_ram']} GB"},
    {'Component': 'Total GPU Memory',
     'Specification': f"{gpu_requirements['actual_gpus_allocated'] * gpu_spec['memory']} GB"},
    {'Component': 'Applications per Tenant',
     'Specification': f"{apps_per_tenant} apps × {num_tenants} tenants = {infrastructure['totals']['total_apps']} total apps"},
    {'Component': 'Deployment Nodes per Tenant',
     'Specification': f"{infrastructure['totals']['deploy_nodes_per_tenant']} node(s) (1 node per 4 apps)"},
]
# Render the platform spec table assembled above.
platform_specs_df = pd.DataFrame(platform_specs_data)
st.dataframe(platform_specs_df, use_container_width=True)

# Provider Pricing Configuration Summary.
# Each per-provider table flags a row '✏️ Custom' when the sidebar value differs
# from the documented default (exact float comparison against the default literal).
st.markdown("### 💰 Provider Pricing Configuration")

# Create tabs for each provider
price_tab1, price_tab2, price_tab3, price_tab4 = st.tabs(["AWS", "Azure", "GCP", "On-Premise"])

with price_tab1:
    # AWS (EKS) line items: compute nodes, cluster management, GPU hourly rates.
    aws_config_data = [{
        'Cost Component': 'Standard Compute Node',
        'Specification': 'm5.2xlarge (8 vCPU, 32GB)',
        'Cost per Hour': f"${aws_standard_node:.3f}",
        'Status': '✏️ Custom' if aws_standard_node != 0.384 else '✅ Default'
    }, {
        'Cost Component': 'VectorDB Node',
        'Specification': 'm5.4xlarge (16 vCPU, 64GB)',
        'Cost per Hour': f"${aws_vectordb_node:.3f}",
        'Status': '✏️ Custom' if aws_vectordb_node != 0.768 else '✅ Default'
    }, {
        'Cost Component': 'Jump Host',
        'Specification': 'm5.large (2 vCPU, 8GB)',
        'Cost per Hour': f"${aws_jump_host:.3f}",
        'Status': '✏️ Custom' if aws_jump_host != 0.096 else '✅ Default'
    }, {
        'Cost Component': 'EKS Management',
        'Specification': 'Managed Kubernetes',
        'Cost per Hour': f"${aws_k8s_management:.3f}",
        'Status': '✏️ Custom' if aws_k8s_management != 0.10 else '✅ Default'
    }, {
        'Cost Component': 'H200 141GB GPU',
        'Specification': 'Flagship+ GPU',
        'Cost per Hour': f"${aws_h200:.2f}",
        'Status': '✏️ Custom' if aws_h200 != 15.70 else '✅ Default'
    }, {
        'Cost Component': 'H100 80GB GPU',
        'Specification': 'Flagship GPU',
        'Cost per Hour': f"${aws_h100:.2f}",
        'Status': '✏️ Custom' if aws_h100 != 6.01 else '✅ Default'
    }, {
        'Cost Component': 'A100 80GB GPU',
        'Specification': 'Excellent GPU',
        'Cost per Hour': f"${aws_a100_80:.2f}",
        'Status': '✏️ Custom' if aws_a100_80 != 3.43 else '✅ Default'
    }, {
        'Cost Component': 'A100 40GB GPU',
        'Specification': 'Good GPU',
        'Cost per Hour': f"${aws_a100_40:.2f}",
        'Status': '✏️ Custom' if aws_a100_40 != 2.75 else '✅ Default'
    }, {
        'Cost Component': 'L40S GPU',
        'Specification': 'Very Good GPU',
        'Cost per Hour': f"${aws_l40s:.2f}",
        'Status': '✏️ Custom' if aws_l40s != 1.67 else '✅ Default'
    }]
    aws_config_df = pd.DataFrame(aws_config_data)
    st.dataframe(aws_config_df, use_container_width=True)

with price_tab2:
    # Azure (AKS) line items. Note AKS cluster management defaults to free (0.0).
    azure_config_data = [{
        'Cost Component': 'Standard Compute Node',
        'Specification': 'Standard_D8s_v3 (8 vCPU, 32GB)',
        'Cost per Hour': f"${azure_standard_node:.3f}",
        'Status': '✏️ Custom' if azure_standard_node != 0.384 else '✅ Default'
    }, {
        'Cost Component': 'VectorDB Node',
        'Specification': 'Standard_D16s_v3 (16 vCPU, 64GB)',
        'Cost per Hour': f"${azure_vectordb_node:.3f}",
        'Status': '✏️ Custom' if azure_vectordb_node != 0.768 else '✅ Default'
    }, {
        'Cost Component': 'Jump Host',
        'Specification': 'Standard_D2s_v3 (2 vCPU, 8GB)',
        'Cost per Hour': f"${azure_jump_host:.3f}",
        'Status': '✏️ Custom' if azure_jump_host != 0.096 else '✅ Default'
    }, {
        'Cost Component': 'AKS Management',
        'Specification': 'Managed Kubernetes (Free)',
        'Cost per Hour': f"${azure_k8s_management:.3f}",
        'Status': '✏️ Custom' if azure_k8s_management != 0.0 else '✅ Default'
    }, {
        'Cost Component': 'H200 141GB GPU',
        'Specification': 'Flagship+ GPU',
        'Cost per Hour': f"${azure_h200:.2f}",
        'Status': '✏️ Custom' if azure_h200 != 12.29 else '✅ Default'
    }, {
        'Cost Component': 'H100 80GB GPU',
        'Specification': 'Flagship GPU',
        'Cost per Hour': f"${azure_h100:.2f}",
        'Status': '✏️ Custom' if azure_h100 != 6.98 else '✅ Default'
    }, {
        'Cost Component': 'A100 80GB GPU',
        'Specification': 'Excellent GPU',
        'Cost per Hour': f"${azure_a100_80:.2f}",
        'Status': '✏️ Custom' if azure_a100_80 != 3.67 else '✅ Default'
    }, {
        'Cost Component': 'A100 40GB GPU',
        'Specification': 'Good GPU',
        'Cost per Hour': f"${azure_a100_40:.2f}",
        # NOTE(review): same 3.67 default as the A100 80GB row above — confirm intended
        'Status': '✏️ Custom' if azure_a100_40 != 3.67 else '✅ Default'
    }]
    azure_config_df = pd.DataFrame(azure_config_data)
    st.dataframe(azure_config_df, use_container_width=True)

with price_tab3:
    # GCP (GKE) line items. No H200 row — not offered in this table.
    gcp_config_data = [{
        'Cost Component': 'Standard Compute Node',
        'Specification': 'n1-standard-8 (8 vCPU, 30GB)',
        'Cost per Hour': f"${gcp_standard_node:.3f}",
        'Status': '✏️ Custom' if gcp_standard_node != 0.379 else '✅ Default'
    }, {
        'Cost Component': 'VectorDB Node',
        'Specification': 'n1-standard-16 (16 vCPU, 60GB)',
        'Cost per Hour': f"${gcp_vectordb_node:.3f}",
        'Status': '✏️ Custom' if gcp_vectordb_node != 0.758 else '✅ Default'
    }, {
        'Cost Component': 'Jump Host',
        'Specification': 'e2-medium (2 vCPU, 8GB)',
        'Cost per Hour': f"${gcp_jump_host:.3f}",
        'Status': '✏️ Custom' if gcp_jump_host != 0.067 else '✅ Default'
    }, {
        'Cost Component': 'GKE Management',
        'Specification': 'Managed Kubernetes',
        'Cost per Hour': f"${gcp_k8s_management:.3f}",
        'Status': '✏️ Custom' if gcp_k8s_management != 0.10 else '✅ Default'
    }, {
        'Cost Component': 'H100 80GB GPU',
        'Specification': 'Flagship GPU',
        'Cost per Hour': f"${gcp_h100:.2f}",
        'Status': '✏️ Custom' if gcp_h100 != 11.06 else '✅ Default'
    }, {
        'Cost Component': 'A100 80GB GPU',
        'Specification': 'Excellent GPU',
        'Cost per Hour': f"${gcp_a100_80:.2f}",
        'Status': '✏️ Custom' if gcp_a100_80 != 2.48 else '✅ Default'
    }, {
        'Cost Component': 'A100 40GB GPU',
        'Specification': 'Good GPU',
        'Cost per Hour': f"${gcp_a100_40:.2f}",
        'Status': '✏️ Custom' if gcp_a100_40 != 1.46 else '✅ Default'
    }]
    gcp_config_df = pd.DataFrame(gcp_config_data)
    st.dataframe(gcp_config_df, use_container_width=True)

with price_tab4:
    # On-premise line items: amortized hardware plus datacenter operating costs.
    onprem_config_data = [{
        'Cost Component': 'Standard Compute Node',
        'Specification': '8 vCPU, 32GB RAM',
        'Cost per Hour': f"${onprem_standard_node:.3f}",
        'Status': '✏️ Custom' if onprem_standard_node != 0.192 else '✅ Default'
    }, {
        'Cost Component': 'VectorDB Node',
        'Specification': '16 vCPU, 64GB RAM',
        'Cost per Hour': f"${onprem_vectordb_node:.3f}",
        'Status': '✏️ Custom' if onprem_vectordb_node != 0.384 else '✅ Default'
    }, {
        'Cost Component': 'Jump Host',
        'Specification': '2 vCPU, 8GB RAM',
        'Cost per Hour': f"${onprem_jump_host:.3f}",
        'Status': '✏️ Custom' if onprem_jump_host != 0.048 else '✅ Default'
    }, {
        # On-prem GPU price is derived as a fraction of the AWS rate.
        'Cost Component': 'GPU Pricing',
        'Specification': f'{onprem_gpu_multiplier*100:.0f}% of AWS pricing',
        'Cost per Hour': f"${GPUS[selected_gpu]['pricing']['on-premise']:.2f} (for {selected_gpu})",
        'Status': '✏️ Custom' if onprem_gpu_multiplier != 0.55 else '✅ Default'
    }, {
        'Cost Component': 'K8s Management',
        'Specification': 'Self-managed operational cost',
        'Cost per Hour': f"${onprem_k8s_management:.3f}",
        'Status': '✏️ Custom' if onprem_k8s_management != 0.05 else '✅ Default'
    }, {
        'Cost Component': 'Network Infrastructure',
        'Specification': 'Switches, routers, firewalls',
        'Cost per Hour': f"${onprem_network:.3f}",
        'Status': '✏️ Custom' if onprem_network != 0.020 else '✅ Default'
    }, {
        # Storage is priced per GB per month, unlike the other hourly rows.
        'Cost Component': 'Storage SAN/NAS',
        'Specification': 'Per GB per month',
        'Cost per Hour': f"${onprem_storage_per_gb:.3f}/GB/month",
        'Status': '✏️ Custom' if onprem_storage_per_gb != 0.05 else '✅ Default'
    }, {
        'Cost Component': 'Hardware Load Balancer',
        'Specification': 'F5/Citrix ADC amortized',
        'Cost per Hour': f"${onprem_load_balancer:.3f}",
        'Status': '✏️ Custom' if onprem_load_balancer != 0.010 else '✅ Default'
    }, {
        'Cost Component': 'Power & Cooling',
        'Specification': 'Datacenter utilities',
        'Cost per Hour': f"${onprem_power_cooling:.3f}",
        'Status': '✏️ Custom' if onprem_power_cooling != 0.030 else '✅ Default'
    }, {
        'Cost Component': 'Datacenter Space',
        'Specification': 'Rack space and facilities',
        'Cost per Hour': f"${onprem_datacenter_space:.3f}",
        'Status': '✏️ Custom' if onprem_datacenter_space != 0.015 else '✅ Default'
    }, {
        'Cost Component': 'Maintenance & Support',
        'Specification': 'Vendor support contracts',
        'Cost per Hour': f"${onprem_maintenance:.3f}",
        'Status': '✏️ Custom' if onprem_maintenance != 0.025 else '✅ Default'
    }]
    onprem_config_df = pd.DataFrame(onprem_config_data)
    st.dataframe(onprem_config_df, use_container_width=True)

st.markdown("""
**💡 Configuration Tips:**
- Adjust pricing in the sidebar under "Cloud Provider Pricing (Optional)"
- Default values based on public pricing as of 2024/2025
- Customize based on your actual contract rates, discounts, or negotiated pricing
- All calculations update automatically when values are changed
- Click "🔄 Reset All Pricing to Defaults" in sidebar to restore original values
""")

# VM Types Summary: one row per provider from the static pricing catalogue.
st.markdown("### 🖥️ Deployment Options Summary")
deployment_options_data = []
for provider in CLOUD_PRICING.keys():
    pricing = CLOUD_PRICING[provider]
    deployment_options_data.append({
        'Provider': provider,
        'Standard Node': pricing['description'],
        'VectorDB Node': pricing['vectordb_node']['instance_type'],
        'Jump Host': pricing['jump_host']['instance_type'],
        'Managed K8s': pricing['name']
    })
deployment_df = pd.DataFrame(deployment_options_data)
st.dataframe(deployment_df, use_container_width=True)

# Recommendations section
st.header("💡 Recommendations & Insights")
col1, col2 = st.columns(2)

with col1:
    st.subheader("🎯 Performance Recommendations")
    # Bottleneck type determines the advice: memory-bound → quantize,
    # otherwise throughput-bound → scale GPU count.
    if gpu_requirements['bottleneck'] == 'Memory':
        st.info("💾 **Memory-bound workload**: Consider using INT8 or INT4 quantization to reduce memory requirements")
    else:
        st.info("⚡ **Throughput-bound workload**: Current memory is sufficient, focus on GPU count for throughput")
    # Headroom = spare capacity above the configured target load, as a %.
    capacity_headroom = gpu_requirements['max_conversations_per_minute'] - conversations_per_minute
    headroom_percentage = (capacity_headroom / gpu_requirements['max_conversations_per_minute']) * 100
    # <20% = near capacity, >70% = generous slack, otherwise balanced.
    if headroom_percentage < 20:
        st.warning(f"🚨 **Low headroom** ({headroom_percentage:.1f}%): System near capacity. Consider adding more GPUs or optimizing workload distribution")
    elif headroom_percentage > 70:
        st.success(f"✅ **High headroom** ({headroom_percentage:.1f}%): System has significant capacity for growth")
    else:
        st.info(f"📊 **Balanced headroom** ({headroom_percentage:.1f}%): Good balance between capacity and resource efficiency")
    # Application deployment recommendations (1 deploy node per 4 apps).
    if apps_per_tenant > 12:
        st.warning(f"📦 **High app density**: {apps_per_tenant} apps per tenant requires {infrastructure['totals']['deploy_nodes_per_tenant']} deployment nodes. Consider application consolidation")
    elif apps_per_tenant <= 4:
        st.success(f"✅ **Efficient deployment**: Only 1 deployment node needed for {apps_per_tenant} apps per tenant")
    else:
        st.info(f"📊 **Moderate app density**: {infrastructure['totals']['deploy_nodes_per_tenant']} deployment nodes for {apps_per_tenant} apps per tenant")

with col2:
    st.subheader("💰 Cost Optimization")
    # Providers that actually stock the selected GPU; branch on how many remain.
    available_providers = get_available_providers_for_gpu(gpu_spec)
    if len(available_providers) >= 2:
        # Compare cheapest vs most expensive total cost across available providers.
        cheapest_provider = min(available_providers, key=lambda x: all_costs[x]['totals']['total_cost'])
        most_expensive_provider = max(available_providers, key=lambda x: all_costs[x]['totals']['total_cost'])
        savings = all_costs[most_expensive_provider]['totals']['total_cost'] - all_costs[cheapest_provider]['totals']['total_cost']
        savings_percentage = (savings / all_costs[most_expensive_provider]['totals']['total_cost']) * 100
        if cheapest_provider == 'On-Premise':
            # On-prem wins on price but carries capex/operational caveats.
            st.success(f"💡 **Recommended Option**: On-Premise Deployment")
            st.info(f"💰 **Cost Advantage**: ${savings:.2f} ({savings_percentage:.1f}%) savings compared to {most_expensive_provider}")
            st.warning("⚠️ **Consider**: Initial capex, datacenter readiness, and operational expertise for on-premise")
        else:
            st.success(f"💡 **Recommended Provider**: {cheapest_provider}")
            st.info(f"💰 **Potential Savings**: ${savings:.2f} ({savings_percentage:.1f}%) compared to {most_expensive_provider}")
        # Cost distribution insight: platform vs GPU share of the cheapest option.
        cheapest_costs = all_costs[cheapest_provider]
        platform_percentage = (cheapest_costs['totals']['platform_cost'] / cheapest_costs['totals']['total_cost']) * 100
        gpu_percentage = (cheapest_costs['totals']['gpu_cost'] / cheapest_costs['totals']['total_cost']) * 100
        if gpu_percentage > 70:
            st.warning("🖥️ **GPU-heavy costs**: Consider optimizing model size or using more efficient GPUs")
        else:
            st.info(f"⚖️ **Balanced infrastructure**: Platform ({platform_percentage:.0f}%) vs GPU ({gpu_percentage:.0f}%)")
    elif len(available_providers) == 1:
        # Single option: no comparison possible, just report it.
        available_provider = available_providers[0]
        st.success(f"💡 **Available Option**: {available_provider}")
        if available_provider == 'On-Premise':
            st.info("🏢 On-premise is your only deployment option for this GPU")
        else:
            st.warning("⚠️ **Limited Options**: Only one provider has the selected GPU available")
        # Show cost distribution for the only available provider
        provider_costs = all_costs[available_provider]
        platform_percentage = (provider_costs['totals']['platform_cost'] / provider_costs['totals']['total_cost']) * 100
        gpu_percentage = (provider_costs['totals']['gpu_cost'] / provider_costs['totals']['total_cost']) * 100
        st.info(f"📊 **Cost Distribution**: Platform ({platform_percentage:.0f}%) vs GPU ({gpu_percentage:.0f}%)")
    else:
        # GPU available nowhere: suggest alternative GPUs that are stocked somewhere.
        st.error("❌ **No Available Options**: Selected GPU is not available on any deployment option")
        st.warning("**Action Required**: Please select a different GPU model")
        # Show alternative GPUs
        st.markdown("**💡 Suggested Alternatives:**")
        alternatives = []
        for gpu_name, gpu_data in GPUS.items():
            available_on = get_available_providers_for_gpu(gpu_data)
            if available_on:
                alternatives.append(f"• **{gpu_name}** - Available on: {', '.join(available_on)}")
        if alternatives:
            for alt in alternatives[:3]:  # Show top 3 alternatives
                st.markdown(alt)

# Infrastructure Summary Box: three-column recap of platform, GPU and performance.
st.header("📋 Infrastructure Summary")
summary_col1, summary_col2, summary_col3 = st.columns(3)

with summary_col1:
    st.markdown("### Platform Infrastructure")
    st.markdown(f"""
- **Tenants**: {num_tenants}
- **Apps per Tenant**: {apps_per_tenant}
- **Total Applications**: {infrastructure['totals']['total_apps']}
- **Standard Nodes**: {infrastructure['totals']['total_standard_nodes']} (8 vCPU, 32GB)
- **VectorDB Nodes**: {infrastructure['totals']['total_vectordb_nodes']} (16 vCPU, 64GB)
- **Total Platform Nodes**: {infrastructure['totals']['total_nodes']}
""")

with summary_col2:
    st.markdown("### GPU Infrastructure")
    # NOTE(review): indexes best_config directly — assumes a valid GPU node
    # layout exists here (other call sites guard with a falsy check); confirm.
    st.markdown(f"""
- **Model**: {selected_model}
- **GPU Type**: {selected_gpu}
- **Precision**: {precision}
- **GPUs Required**: {gpu_requirements['total_gpus_needed']}
- **GPUs Allocated**: {gpu_requirements['actual_gpus_allocated']}
- **GPU Configuration**: {gpu_requirements['best_config']['num_nodes']} nodes × {gpu_requirements['best_config']['gpus_per_node']} GPUs
""")

with summary_col3:
    st.markdown("### Performance Metrics")
    # Headroom recomputed here from the same inputs as the recommendations column.
    capacity_headroom = gpu_requirements['max_conversations_per_minute'] - conversations_per_minute
    headroom_percentage = (capacity_headroom / gpu_requirements['max_conversations_per_minute']) * 100
    st.markdown(f"""
- **Target Load**: {conversations_per_minute} conv/min
- **Max Capacity**: {gpu_requirements['max_conversations_per_minute']:.0f} conv/min
- **Capacity Headroom**: {headroom_percentage:.1f}%
- **Bottleneck**: {gpu_requirements['bottleneck']}
- **Total TPS**: {gpu_requirements['total_system_tps']:.0f}
- **Tokens/Conv**: {tokens_per_conversation}
""")


# Script entry point: build the full dashboard when run directly.
if __name__ == "__main__":
    create_comprehensive_dashboard()