# src/streamlit_app.py — Katonic Multitenant Infrastructure Calculator
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import math
from plotly.subplots import make_subplots
import json
from datetime import datetime
# Configure Streamlit for better performance.
# NOTE(review): st.set_page_config must be the first Streamlit call in a script
# run, and this module also calls it inside create_comprehensive_dashboard();
# duplicate calls typically raise StreamlitAPIException — confirm and drop one.
st.set_page_config(
    page_title="Katonic Multitenant Infrastructure Calculator",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Cloud provider and On-Premise pricing data (per hour in USD)
# Schema per provider: hourly rates for standard nodes ('cost_per_node_hour'),
# high-memory VectorDB nodes ('vectordb_node'), a jump host, a managed-K8s fee,
# plus an 'additional_services' map priced either per hour ('cost_per_hour')
# or per GB-month ('cost_per_gb_month' — consumed by calculate_detailed_costs).
CLOUD_PRICING = {
    'On-Premise': {
        'name': 'On-Premise Datacenter',
        'cost_per_node_hour': 0.192,  # ~50% of cloud (amortized hardware + power + cooling over 3 years)
        'managed_k8s_cost': 0.05,  # Self-managed K8s operational cost (admin time, monitoring tools)
        'description': 'Dell PowerEdge R640 / HPE DL360 equivalent',
        'specs': '8 vCPUs, 32GB RAM',
        'vectordb_node': {
            'instance_type': 'Dell PowerEdge R740 / HPE DL380 equivalent',
            'cost_per_hour': 0.384,  # ~50% of cloud (high-memory server amortized)
            'specs': '16 vCPUs, 64GB RAM'
        },
        'jump_host': {
            'instance_type': 'Dell PowerEdge R440 / HPE DL20 equivalent',
            'cost_per_hour': 0.048,  # ~50% of cloud (small server amortized)
            'specs': '2 vCPUs, 8GB RAM'
        },
        'additional_services': {
            'Network_Infrastructure': {'cost_per_hour': 0.020, 'description': 'Switches, routers, firewalls (amortized)'},
            'Storage_SAN': {'cost_per_gb_month': 0.05, 'description': 'SAN/NAS storage (1TB base, amortized)'},
            'Hardware_Load_Balancer': {'cost_per_hour': 0.010, 'description': 'F5/Citrix ADC (amortized)'},
            'Power_Cooling': {'cost_per_hour': 0.030, 'description': 'Datacenter power (0.1kW/server) and cooling'},
            'Datacenter_Space': {'cost_per_hour': 0.015, 'description': 'Rack space and facilities costs'},
            'Maintenance_Support': {'cost_per_hour': 0.025, 'description': 'Hardware maintenance and vendor support contracts'}
        },
        'gpu_pricing_multiplier': 0.55,  # On-prem GPU costs are ~55% of cloud (hardware amortization + power)
        'notes': 'Costs include: hardware amortization (3-year lifecycle), power (~$0.10/kWh), cooling (1:1 ratio), rack space, network infrastructure, storage, and maintenance. Assumes enterprise datacenter with N+1 redundancy. Does NOT include: initial capex, datacenter construction, staff salaries (covered in K8s management cost).'
    },
    'AWS': {
        'name': 'Amazon EKS',
        'cost_per_node_hour': 0.384,
        'managed_k8s_cost': 0.10,
        'description': 'm5.2xlarge instances',
        'specs': '8 vCPUs, 32GB RAM',
        'vectordb_node': {
            'instance_type': 'm5.4xlarge',
            'cost_per_hour': 0.768,
            'specs': '16 vCPUs, 64GB RAM'
        },
        'jump_host': {
            'instance_type': 'm5.large',
            'cost_per_hour': 0.096,
            'specs': '2 vCPUs, 8GB RAM'
        },
        'additional_services': {
            'VPC': {'cost_per_hour': 0.0, 'description': 'Virtual Private Cloud (Free)'},
            'EBS': {'cost_per_gb_month': 0.10, 'description': 'Elastic Block Store (1TB expandable)'},
            'ELB': {'cost_per_hour': 0.025, 'description': 'Elastic Load Balancer'},
            'EIP': {'cost_per_hour': 0.005, 'description': 'Elastic IP Address'}
        }
    },
    'Azure': {
        'name': 'Azure Kubernetes Service',
        'cost_per_node_hour': 0.384,
        'managed_k8s_cost': 0.0,
        'description': 'Standard_D8s_v3 instances',
        'specs': '8 vCPUs, 32GB RAM',
        'vectordb_node': {
            'instance_type': 'Standard_D16s_v3',
            'cost_per_hour': 0.768,
            'specs': '16 vCPUs, 64GB RAM'
        },
        'jump_host': {
            'instance_type': 'Standard_D2s_v3',
            'cost_per_hour': 0.096,
            'specs': '2 vCPUs, 8GB RAM'
        },
        'additional_services': {
            'VNet': {'cost_per_hour': 0.0, 'description': 'Virtual Network (Free)'},
            'Managed_Disks': {'cost_per_gb_month': 0.10, 'description': 'Managed Disks (1TB expandable)'},
            'Load_Balancer': {'cost_per_hour': 0.025, 'description': 'Azure Load Balancer'},
            'Public_IP': {'cost_per_hour': 0.005, 'description': 'Public IP Address'}
        }
    },
    'GCP': {
        'name': 'Google Kubernetes Engine',
        'cost_per_node_hour': 0.379,
        'managed_k8s_cost': 0.10,
        'description': 'n1-standard-8 instances',
        'specs': '8 vCPUs, 30GB RAM',
        'vectordb_node': {
            'instance_type': 'n1-standard-16',
            'cost_per_hour': 0.758,
            'specs': '16 vCPUs, 60GB RAM'
        },
        'jump_host': {
            'instance_type': 'e2-medium',
            'cost_per_hour': 0.067,
            'specs': '2 vCPUs, 8GB RAM'
        },
        'additional_services': {
            'VPC': {'cost_per_hour': 0.0, 'description': 'Virtual Private Cloud (Free)'},
            'Persistent_Disk': {'cost_per_gb_month': 0.10, 'description': 'Persistent Disk (1TB expandable)'},
            'Load_Balancer': {'cost_per_hour': 0.025, 'description': 'Cloud Load Balancing'},
            'Static_IP': {'cost_per_hour': 0.004, 'description': 'Static External IP'},
            'Cloud_Storage': {'cost_per_gb_month': 0.020, 'description': 'GCS Bucket (Optional)'},
            'Filestore': {'cost_per_gb_month': 0.20, 'description': 'Filestore (depends on usage)'}
        }
    }
}
# Production-grade model specifications.
# 'params' / 'active_params' are parameter counts in billions (MoE models have
# fewer active than total); 'memory_per_param' is bytes per parameter at the
# default precision (2 = FP16); 'max_context' is in tokens.
# NOTE(review): 'base_tps' is passed to calculate_model_tps_on_gpu but not
# actually used there — throughput comes from the GPU's tps range instead.
MODELS = {
    "Llama 4 Maverick": {
        "params": 400,
        "active_params": 17,
        "memory_per_param": 2,
        "max_context": 1000000,
        "base_tps": 4200,
        "org": "Meta",
        "license": "Open-weight",
        "notes": "Multimodal MoE; 1M context; text, image, code, reasoning"
    },
    "Llama 4 Scout": {
        "params": 109,
        "active_params": 17,
        "memory_per_param": 2,
        "max_context": 10000000,
        "base_tps": 4500,
        "org": "Meta",
        "license": "Open-weight",
        "notes": "Multimodal MoE; 10M context; efficient for long-form tasks"
    },
    "Llama 3.3 70B": {
        "params": 70,
        "active_params": 70,
        "memory_per_param": 2,
        "max_context": 128000,
        "base_tps": 1800,
        "org": "Meta",
        "license": "Community (open)",
        "notes": "Multilingual; matches Llama 3.1 405B performance"
    },
    "Qwen2 110B": {
        "params": 110,
        "active_params": 110,
        "memory_per_param": 2,
        "max_context": 128000,
        "base_tps": 1200,
        "org": "Alibaba/Qwen",
        "license": "Apache 2.0",
        "notes": "Multilingual; top-tier reasoning and coding"
    },
    "DeepSeek-VL 110B": {
        "params": 110,
        "active_params": 110,
        "memory_per_param": 2,
        "max_context": 128000,
        "base_tps": 1100,
        "org": "DeepSeek AI",
        "license": "MIT",
        "notes": "Multimodal (vision+language); GPT-4V alternative"
    },
    "Mixtral 8x22B": {
        "params": 141,
        "active_params": 39,
        "memory_per_param": 2,
        "max_context": 65536,
        "base_tps": 2800,
        "org": "Mistral AI",
        "license": "Apache 2.0",
        "notes": "Sparse MoE; efficiency leader among MoE models"
    }
}
# GPU catalog: 'memory' is on-board VRAM in GB, 'compute' a relative score,
# 'tps_min'/'tps_max' the tokens-per-second range used by the TPS estimator,
# and 'pricing' the hourly USD rate per provider ("NA" = not offered there;
# is_gpu_available_for_provider treats non-numeric entries as unavailable).
GPUS = {
    "H200 141GB": {
        "memory": 141,
        "compute": 9.0,
        "tps_min": 5486,
        "tps_max": 18690,
        "efficiency_tier": "Flagship+",
        "pricing": {
            "aws": 15.70,
            "azure": 12.29,
            "gcp": "NA",
            "on-premise": 8.64  # 55% of AWS price (hardware amortization + power)
        }
    },
    "H100 80GB": {
        "memory": 80,
        "compute": 9.0,
        "tps_min": 2400,
        "tps_max": 14000,
        "efficiency_tier": "Flagship",
        "pricing": {
            "aws": 6.01,
            "azure": 6.98,
            "gcp": 11.06,
            "on-premise": 3.31  # 55% of AWS price
        }
    },
    "A100 80GB": {
        "memory": 80,
        "compute": 8.0,
        "tps_min": 1100,
        "tps_max": 2000,
        "efficiency_tier": "Excellent",
        "pricing": {
            "aws": 3.43,
            "azure": 3.67,
            "gcp": 2.48,
            "on-premise": 1.89  # 55% of AWS price
        }
    },
    "A100 40GB": {
        "memory": 40,
        "compute": 8.0,
        "tps_min": 1000,
        "tps_max": 1800,
        "efficiency_tier": "Good",
        "pricing": {
            "aws": 2.75,
            "azure": 3.67,
            "gcp": 1.46,
            "on-premise": 1.51  # 55% of AWS price
        }
    },
    "L40S": {
        "memory": 48,
        "compute": 8.9,
        "tps_min": 4000,
        "tps_max": 4768,
        "efficiency_tier": "Very Good",
        "pricing": {
            "aws": 1.67,
            "azure": "NA",
            "gcp": "NA",
            "on-premise": 0.92  # 55% of AWS price
        }
    }
}
@st.cache_data(show_spinner=False, ttl=300)
def calculate_detailed_infrastructure(num_tenants, apps_per_tenant):
    """Calculate detailed infrastructure requirements with node type breakdown - CACHED.

    Sizes platform/compute/deploy nodes (8 vCPU, 32GB each) and high-memory
    VectorDB nodes (16 vCPU, 64GB each) for the given tenant and app counts,
    returning a breakdown per node type plus aggregate totals.
    """
    # Hardware profiles
    std_cores, std_ram = 8, 32      # standard node: 8 vCPU / 32GB RAM
    vdb_cores, vdb_ram = 16, 64     # VectorDB node: 16 vCPU / 64GB RAM

    # Shared base platform footprint plus per-tenant allocations
    base_platform = 2
    per_tenant_platform = 1
    per_tenant_compute = 1
    per_tenant_vectordb = 1
    # Every 4 apps need 1 deployment node (rounded up)
    per_tenant_deploy = math.ceil(apps_per_tenant / 4)

    # Fleet totals per node type
    platform_total = base_platform + per_tenant_platform * num_tenants
    compute_total = per_tenant_compute * num_tenants
    deploy_total = per_tenant_deploy * num_tenants
    vectordb_total = per_tenant_vectordb * num_tenants

    # Standard nodes exclude VectorDB (different hardware profile)
    standard_total = platform_total + compute_total + deploy_total
    node_total = standard_total + vectordb_total

    # Aggregate resources and app capacity
    cpu_total = (standard_total * std_cores) + (vectordb_total * vdb_cores)
    ram_total = (standard_total * std_ram) + (vectordb_total * vdb_ram)
    app_total = num_tenants * apps_per_tenant

    return {
        'node_breakdown': {
            'Platform Nodes': {
                'base': base_platform,
                'tenant': per_tenant_platform * num_tenants,
                'total': platform_total,
                'cores': platform_total * std_cores,
                'ram': platform_total * std_ram,
                'purpose': 'Tenancy Manager + Tenant platform services',
                'node_type': 'Standard (8 vCPU, 32GB RAM)'
            },
            'Compute Nodes': {
                'base': 0,
                'tenant': compute_total,
                'total': compute_total,
                'cores': compute_total * std_cores,
                'ram': compute_total * std_ram,
                'purpose': 'Computational workloads',
                'node_type': 'Standard (8 vCPU, 32GB RAM)'
            },
            'Deploy Nodes': {
                'base': 0,
                'tenant': deploy_total,
                'total': deploy_total,
                'cores': deploy_total * std_cores,
                'ram': deploy_total * std_ram,
                'purpose': f'Application deployment ({per_tenant_deploy} node(s) per {apps_per_tenant} apps)',
                'node_type': 'Standard (8 vCPU, 32GB RAM)'
            },
            'VectorDB Nodes': {
                'base': 0,
                'tenant': vectordb_total,
                'total': vectordb_total,
                'cores': vectordb_total * vdb_cores,
                'ram': vectordb_total * vdb_ram,
                'purpose': 'Vector database operations (high memory)',
                'node_type': 'High-Memory (16 vCPU, 64GB RAM)'
            }
        },
        'totals': {
            'total_nodes': node_total,
            'total_standard_nodes': standard_total,
            'total_vectordb_nodes': vectordb_total,
            'total_cpu': cpu_total,
            'total_ram': ram_total,
            'total_apps': app_total,
            'deploy_nodes_per_tenant': per_tenant_deploy
        },
        'specs': {
            'cores_per_node': std_cores,
            'ram_per_node': std_ram,
            'vectordb_cores_per_node': vdb_cores,
            'vectordb_ram_per_node': vdb_ram
        }
    }
def calculate_model_memory_requirements(model_params, active_params, precision_bytes):
    """Estimate total GPU memory (GB) needed to host a model for inference.

    Sized as raw weights plus a 25% runtime overhead and a 10% KV-cache
    allowance. NOTE(review): active_params is currently unused — sizing is
    based on total parameters; confirm whether MoE models should size on
    active params instead.
    """
    weights = model_params * precision_bytes
    # 25% framework/activation overhead, 10% KV-cache allowance
    return weights + weights * 0.25 + weights * 0.1
def calculate_model_tps_on_gpu(model_base_tps, model_params, active_params, gpu_spec):
    """Estimate tokens/sec for one model on one GPU.

    Scales the GPU's published TPS range by the model's active parameter
    count relative to a 70B reference, then assumes realized throughput
    lands 30% of the way between the scaled min and max.
    Returns (estimated_tps, scaled_min, scaled_max).
    NOTE(review): model_base_tps and model_params are unused here — confirm
    whether they should influence the estimate.
    """
    # Smaller active models run faster; sub-linear (^0.7) scaling vs 70B
    scale = (70 / active_params) ** 0.7
    low = gpu_spec["tps_min"] * scale
    high = gpu_spec["tps_max"] * scale
    # Conservative point estimate: 30% into the [low, high] range
    return low + (high - low) * 0.3, low, high
def calculate_gpu_node_configurations(total_gpus_needed, gpu_memory_gb, gpu_spec):
    """Enumerate GPU node layouts using standard 1/2/4/8-GPU node sizes.

    Args:
        total_gpus_needed: minimum GPU count the workload requires.
        gpu_memory_gb: model memory footprint that must fit in one node.
        gpu_spec: GPU entry providing per-card "memory" in GB.

    Returns:
        (configurations, min_gpus_per_node) where configurations is sorted
        best-first (highest utilization, then fewest allocated GPUs).

    Fix: the original duplicated the entire config-building code in its
    fallback branch; both paths now share one helper.
    """
    standard_configs = [1, 2, 4, 8]
    # Smallest node size whose combined VRAM fits the model
    min_gpus_per_node = math.ceil(gpu_memory_gb / gpu_spec["memory"])

    def _config_for(gpus_per_node):
        """Build the layout entry for nodes holding `gpus_per_node` GPUs."""
        num_nodes = math.ceil(total_gpus_needed / gpus_per_node)
        allocated = num_nodes * gpus_per_node
        return {
            'gpus_per_node': gpus_per_node,
            'num_nodes': num_nodes,
            'total_gpus_allocated': allocated,
            'total_gpus_needed': total_gpus_needed,
            'utilization': (total_gpus_needed / allocated) * 100,
            'gpu_waste': allocated - total_gpus_needed,
            'meets_memory_req': gpus_per_node >= min_gpus_per_node,
            'memory_utilization': (gpu_memory_gb / (gpus_per_node * gpu_spec["memory"])) * 100
        }

    # Only node sizes that can hold the whole model in memory
    configurations = [_config_for(g) for g in standard_configs if g >= min_gpus_per_node]
    # Fallback: model needs more than 8 GPUs per node — list all sizes anyway
    if not configurations:
        configurations = [_config_for(g) for g in standard_configs]

    # Best utilization first; break ties with fewer total GPUs (stable sort)
    configurations.sort(key=lambda c: (-c['utilization'], c['total_gpus_allocated']))
    return configurations, min_gpus_per_node
@st.cache_data(show_spinner=False, ttl=300)
def calculate_gpu_requirements(conversations_per_minute, tokens_per_conversation, model_spec, gpu_spec, precision_bytes):
    """Calculate GPU requirements for LLM inference with proper node configurations - CACHED.

    Combines the model's memory footprint with the workload's token
    throughput demand, then selects the most efficient standard node layout.
    """
    # Aggregate token throughput the system must sustain (tokens/second)
    required_tps = (conversations_per_minute * tokens_per_conversation) / 60

    # Model memory footprint (weights + overhead + KV cache), in GB
    model_memory_gb = calculate_model_memory_requirements(
        model_spec["params"], model_spec["active_params"], precision_bytes
    )

    # Expected per-GPU throughput for this model on this GPU
    estimated_tps, tps_min, tps_max = calculate_model_tps_on_gpu(
        model_spec["base_tps"], model_spec["params"], model_spec["active_params"], gpu_spec
    )

    # GPUs needed to satisfy each constraint independently; at least one
    by_memory = math.ceil(model_memory_gb / gpu_spec["memory"])
    by_throughput = math.ceil(required_tps / estimated_tps)
    gpus_required = max(by_memory, by_throughput, 1)

    # Enumerate standard node layouts and take the most efficient one
    gpu_configs, min_gpus_per_node = calculate_gpu_node_configurations(
        gpus_required, model_memory_gb, gpu_spec
    )
    best = gpu_configs[0] if gpu_configs else None
    allocated = best['total_gpus_allocated'] if best else gpus_required

    return {
        'gpus_needed_memory': by_memory,
        'gpus_needed_throughput': by_throughput,
        'total_gpus_needed': gpus_required,
        'actual_gpus_allocated': allocated,
        'gpu_configurations': gpu_configs,
        'best_config': best,
        'min_gpus_per_node': min_gpus_per_node,
        'model_memory_gb': model_memory_gb,
        'required_tps': required_tps,
        'estimated_tps': estimated_tps,
        'tps_range': (tps_min, tps_max),
        'total_system_tps': estimated_tps * allocated,
        'max_conversations_per_minute': (estimated_tps * allocated * 60) / tokens_per_conversation,
        'bottleneck': 'Memory' if by_memory >= by_throughput else 'Throughput'
    }
def is_gpu_available_for_provider(provider, gpu_spec):
    """Return True when `provider` lists a real (positive numeric) price for this GPU.

    Missing providers, "NA" markers, and zero/negative prices all count as
    unavailable.
    """
    price = gpu_spec.get("pricing", {}).get(provider.lower())
    # A usable price is numeric and positive; "NA"/None fail isinstance
    return isinstance(price, (int, float)) and price > 0
def get_available_providers_for_gpu(gpu_spec):
    """List the CLOUD_PRICING providers that actually offer this GPU.

    Preserves CLOUD_PRICING's insertion order.
    """
    return [p for p in CLOUD_PRICING if is_gpu_available_for_provider(p, gpu_spec)]
def create_downloadable_cost_report(all_costs, infrastructure, gpu_requirements, model_spec, gpu_spec, selected_model, selected_gpu, num_tenants, apps_per_tenant, conversations_per_minute, tokens_per_conversation, precision, time_period):
    """Assemble a JSON-serializable cost report covering every provider.

    GPU-dependent figures are replaced with the string 'N/A' for providers
    where the selected GPU is not available.
    """
    totals = infrastructure['totals']

    def _or_na(value, available):
        # GPU-dependent figures are meaningless when the GPU is not offered
        return value if available else 'N/A'

    report = {
        'report_metadata': {
            'generated_at': datetime.now().isoformat(),
            'configuration': {
                'tenants': num_tenants,
                'apps_per_tenant': apps_per_tenant,
                'total_apps': num_tenants * apps_per_tenant,
                'model': selected_model,
                'gpu': selected_gpu,
                'precision': precision,
                'conversations_per_minute': conversations_per_minute,
                'tokens_per_conversation': tokens_per_conversation,
                'time_period': time_period
            }
        },
        'infrastructure_summary': {
            'platform_nodes': totals['total_standard_nodes'],
            'vectordb_nodes': totals['total_vectordb_nodes'],
            'total_nodes': totals['total_nodes'],
            'gpu_nodes': gpu_requirements['total_gpus_needed'],
            'total_cpu_cores': totals['total_cpu'],
            'total_ram_gb': totals['total_ram'],
            'total_gpu_memory_gb': gpu_requirements['total_gpus_needed'] * gpu_spec['memory'],
            'max_conversations_per_minute': gpu_requirements['max_conversations_per_minute']
        },
        'cost_breakdown_by_provider': {}
    }

    # One breakdown entry per provider, flattening the nested cost dicts
    for name, cost in all_costs.items():
        available = is_gpu_available_for_provider(name, gpu_spec)
        platform = cost['platform_costs']
        gpu = cost['gpu_costs']
        overall = cost['totals']
        report['cost_breakdown_by_provider'][name] = {
            'gpu_available': available,
            'platform_costs': {
                'kubernetes_nodes': platform['total_node_cost'],
                'vectordb_nodes': platform['vectordb_node_cost'],
                'jump_host': platform['jump_host_cost'],
                'additional_services': platform['additional_services_cost'],
                'k8s_management': platform['k8s_management_cost'],
                'platform_total': platform['platform_total']
            },
            'gpu_costs': {
                'gpu_count': gpu['gpu_count'],
                'gpu_cost_per_hour': gpu['gpu_cost_per_hour'],
                'total_gpu_cost': _or_na(gpu['total_gpu_cost'], available)
            },
            'totals': {
                'platform_cost': overall['platform_cost'],
                'gpu_cost': _or_na(overall['gpu_cost'], available),
                'total_cost': _or_na(overall['total_cost'], available),
                'cost_per_hour': _or_na(overall['cost_per_hour'], available),
                'cost_per_day': _or_na(overall['cost_per_day'], available)
            },
            'service_details': platform['service_costs']
        }
    return report
def format_cost_for_display(cost, available=True):
    """Render a cost as a two-decimal dollar string, or 'N/A' when unavailable."""
    if available and cost != 'N/A':
        return f"${cost:.2f}"
    return 'N/A'
def calculate_detailed_costs(provider, infrastructure, gpu_requirements, gpu_spec, days=30):
    """Price the full stack (platform + GPUs) for one provider over `days` days.

    Returns nested platform/GPU/total breakdowns; GPU figures are zero and
    total_cost is None when the GPU is not offered by this provider.
    """
    pricing = CLOUD_PRICING[provider]
    hours = days * 24

    # --- Kubernetes node costs ------------------------------------------
    node_costs = {}
    standard_cost = 0
    vectordb_cost = 0
    for kind, info in infrastructure['node_breakdown'].items():
        if kind == 'VectorDB Nodes':
            # High-memory nodes carry their own hourly rate
            cost = info['total'] * pricing['vectordb_node']['cost_per_hour'] * hours
            vectordb_cost = cost
        else:
            cost = info['total'] * pricing['cost_per_node_hour'] * hours
            standard_cost += cost
        node_costs[kind] = {
            'count': info['total'],
            'cost': cost,
            'cores': info['cores'],
            'ram': info['ram'],
            'node_type': info.get('node_type', 'Standard')
        }
    node_cost_total = standard_cost + vectordb_cost

    jump_host_cost = pricing['jump_host']['cost_per_hour'] * hours

    # --- Additional services --------------------------------------------
    service_costs = {}
    services_total = 0
    for service, spec in pricing['additional_services'].items():
        if 'cost_per_hour' in spec:
            fee = spec['cost_per_hour'] * hours
        elif 'cost_per_gb_month' in spec and any(
            tag in service.lower() for tag in ('storage', 'disk', 'ebs', 'san')
        ):
            # Storage-like services: billed on a 1TB base, prorated by days
            fee = spec['cost_per_gb_month'] * 1024 * (days / 30)
        else:
            fee = 0
        service_costs[service] = fee
        services_total += fee

    k8s_cost = pricing['managed_k8s_cost'] * hours

    # --- GPU costs (only when this provider offers the GPU) -------------
    gpu_available = is_gpu_available_for_provider(provider, gpu_spec)
    gpu_rate = 0
    gpu_total = 0
    if gpu_available:
        gpu_rate = gpu_spec.get("pricing", {})[provider.lower()]
        gpu_total = gpu_requirements['actual_gpus_allocated'] * gpu_rate * hours

    platform_cost = node_cost_total + jump_host_cost + services_total + k8s_cost
    # None marks "not priceable here" (GPU unavailable for this provider)
    total_cost = (platform_cost + gpu_total) if gpu_available else None

    return {
        'platform_costs': {
            'node_costs': node_costs,
            'total_node_cost': node_cost_total,
            'vectordb_node_cost': vectordb_cost,
            'jump_host_cost': jump_host_cost,
            'service_costs': service_costs,
            'additional_services_cost': services_total,
            'k8s_management_cost': k8s_cost,
            'platform_total': platform_cost
        },
        'gpu_costs': {
            'gpu_count': gpu_requirements['actual_gpus_allocated'],
            'gpu_cost_per_hour': gpu_rate,
            'total_gpu_cost': gpu_total,
            'gpu_available': gpu_available
        },
        'totals': {
            'platform_cost': platform_cost,
            'gpu_cost': gpu_total,
            'total_cost': total_cost,
            'cost_per_hour': total_cost / hours if total_cost is not None else None,
            'cost_per_day': total_cost / days if total_cost is not None else None,
            'gpu_available': gpu_available
        }
    }
def create_comprehensive_dashboard():
st.set_page_config(
page_title="Katonic Multitenant Infrastructure Calculator",
page_icon="🚀",
layout="wide"
)
st.title("🚀 Katonic Multitenant Infrastructure Calculator")
st.markdown("**Comprehensive infrastructure planning for multi-tenant LLMOPS platforms with GPU-accelerated LLM inference**")
# Sidebar Configuration
with st.sidebar:
st.header("🔧 Configuration")
# Platform Configuration
st.subheader("Platform Settings")
num_tenants = st.slider(
"Number of Tenants",
min_value=1,
max_value=20,
value=3,
help="Each tenant requires dedicated platform, compute, deploy, and VectorDB nodes"
)
apps_per_tenant = st.number_input(
"Apps per Tenant",
min_value=1,
max_value=50,
value=4,
step=1,
help="Number of applications per tenant. Every 4 apps require 1 deployment node"
)
# Cloud Provider Pricing Configuration
st.subheader("Cloud Provider Pricing (Optional)")
# AWS Pricing
with st.expander("☁️ Customize AWS Costs", expanded=False):
st.markdown("**Adjust AWS pricing (per hour in USD)**")
st.markdown("##### Compute Nodes")
aws_standard_node = st.number_input(
"m5.2xlarge (8 vCPU, 32GB)",
min_value=0.01,
max_value=2.00,
value=0.384,
step=0.01,
format="%.3f",
key="aws_standard",
help="Default: $0.384/hr"
)
aws_vectordb_node = st.number_input(
"m5.4xlarge (16 vCPU, 64GB)",
min_value=0.01,
max_value=4.00,
value=0.768,
step=0.01,
format="%.3f",
key="aws_vectordb",
help="Default: $0.768/hr"
)
aws_jump_host = st.number_input(
"m5.large (2 vCPU, 8GB)",
min_value=0.01,
max_value=0.50,
value=0.096,
step=0.01,
format="%.3f",
key="aws_jump",
help="Default: $0.096/hr"
)
aws_k8s_management = st.number_input(
"EKS Management Cost",
min_value=0.0,
max_value=0.50,
value=0.10,
step=0.01,
format="%.3f",
key="aws_k8s",
help="Default: $0.10/hr"
)
st.markdown("##### GPU Pricing")
col1, col2 = st.columns(2)
with col1:
aws_h200 = st.number_input("H200 141GB", value=15.70, step=0.10, format="%.2f", key="aws_h200")
aws_h100 = st.number_input("H100 80GB", value=6.01, step=0.10, format="%.2f", key="aws_h100")
aws_a100_80 = st.number_input("A100 80GB", value=3.43, step=0.10, format="%.2f", key="aws_a100_80")
with col2:
aws_a100_40 = st.number_input("A100 40GB", value=2.75, step=0.10, format="%.2f", key="aws_a100_40")
aws_l40s = st.number_input("L40S", value=1.67, step=0.10, format="%.2f", key="aws_l40s")
# Azure Pricing
with st.expander("☁️ Customize Azure Costs", expanded=False):
st.markdown("**Adjust Azure pricing (per hour in USD)**")
st.markdown("##### Compute Nodes")
azure_standard_node = st.number_input(
"Standard_D8s_v3 (8 vCPU, 32GB)",
min_value=0.01,
max_value=2.00,
value=0.384,
step=0.01,
format="%.3f",
key="azure_standard",
help="Default: $0.384/hr"
)
azure_vectordb_node = st.number_input(
"Standard_D16s_v3 (16 vCPU, 64GB)",
min_value=0.01,
max_value=4.00,
value=0.768,
step=0.01,
format="%.3f",
key="azure_vectordb",
help="Default: $0.768/hr"
)
azure_jump_host = st.number_input(
"Standard_D2s_v3 (2 vCPU, 8GB)",
min_value=0.01,
max_value=0.50,
value=0.096,
step=0.01,
format="%.3f",
key="azure_jump",
help="Default: $0.096/hr"
)
azure_k8s_management = st.number_input(
"AKS Management Cost",
min_value=0.0,
max_value=0.50,
value=0.0,
step=0.01,
format="%.3f",
key="azure_k8s",
help="Default: $0.00/hr (Free tier)"
)
st.markdown("##### GPU Pricing")
col1, col2 = st.columns(2)
with col1:
azure_h200 = st.number_input("H200 141GB", value=12.29, step=0.10, format="%.2f", key="azure_h200")
azure_h100 = st.number_input("H100 80GB", value=6.98, step=0.10, format="%.2f", key="azure_h100")
azure_a100_80 = st.number_input("A100 80GB", value=3.67, step=0.10, format="%.2f", key="azure_a100_80")
with col2:
azure_a100_40 = st.number_input("A100 40GB", value=3.67, step=0.10, format="%.2f", key="azure_a100_40")
# GCP Pricing
with st.expander("☁️ Customize GCP Costs", expanded=False):
st.markdown("**Adjust GCP pricing (per hour in USD)**")
st.markdown("##### Compute Nodes")
gcp_standard_node = st.number_input(
"n1-standard-8 (8 vCPU, 30GB)",
min_value=0.01,
max_value=2.00,
value=0.379,
step=0.01,
format="%.3f",
key="gcp_standard",
help="Default: $0.379/hr"
)
gcp_vectordb_node = st.number_input(
"n1-standard-16 (16 vCPU, 60GB)",
min_value=0.01,
max_value=4.00,
value=0.758,
step=0.01,
format="%.3f",
key="gcp_vectordb",
help="Default: $0.758/hr"
)
gcp_jump_host = st.number_input(
"e2-medium (2 vCPU, 8GB)",
min_value=0.01,
max_value=0.50,
value=0.067,
step=0.01,
format="%.3f",
key="gcp_jump",
help="Default: $0.067/hr"
)
gcp_k8s_management = st.number_input(
"GKE Management Cost",
min_value=0.0,
max_value=0.50,
value=0.10,
step=0.01,
format="%.3f",
key="gcp_k8s",
help="Default: $0.10/hr"
)
st.markdown("##### GPU Pricing")
col1, col2 = st.columns(2)
with col1:
gcp_h100 = st.number_input("H100 80GB", value=11.06, step=0.10, format="%.2f", key="gcp_h100")
gcp_a100_80 = st.number_input("A100 80GB", value=2.48, step=0.10, format="%.2f", key="gcp_a100_80")
with col2:
gcp_a100_40 = st.number_input("A100 40GB", value=1.46, step=0.10, format="%.2f", key="gcp_a100_40")
# On-Premise Pricing
with st.expander("🏢 Customize On-Premise Costs", expanded=False):
st.markdown("**Adjust on-premise costs based on your infrastructure**")
st.markdown("##### Compute Nodes (per hour)")
onprem_standard_node = st.number_input(
"Standard Node (8 vCPU, 32GB)",
min_value=0.01,
max_value=1.00,
value=0.192,
step=0.01,
format="%.3f",
key="onprem_standard",
help="Cost per hour for standard compute nodes (default: $0.192)"
)
onprem_vectordb_node = st.number_input(
"VectorDB Node (16 vCPU, 64GB)",
min_value=0.01,
max_value=2.00,
value=0.384,
step=0.01,
format="%.3f",
key="onprem_vectordb",
help="Cost per hour for high-memory VectorDB nodes (default: $0.384)"
)
onprem_jump_host = st.number_input(
"Jump Host (2 vCPU, 8GB)",
min_value=0.01,
max_value=0.50,
value=0.048,
step=0.01,
format="%.3f",
key="onprem_jump",
help="Cost per hour for jump host (default: $0.048)"
)
st.markdown("##### GPU Pricing Multiplier")
onprem_gpu_multiplier = st.slider(
"GPU Cost Multiplier (% of AWS)",
min_value=30,
max_value=100,
value=55,
step=5,
key="onprem_gpu_mult",
help="Percentage of AWS GPU pricing for on-premise (default: 55%)"
) / 100
st.markdown("##### Additional Services (per hour)")
onprem_network = st.number_input(
"Network Infrastructure",
min_value=0.0,
max_value=0.10,
value=0.020,
step=0.005,
format="%.3f",
key="onprem_network",
help="Switches, routers, firewalls (default: $0.020)"
)
onprem_storage_per_gb = st.number_input(
"Storage (per GB per month)",
min_value=0.01,
max_value=0.20,
value=0.05,
step=0.01,
format="%.3f",
key="onprem_storage",
help="SAN/NAS storage cost (default: $0.05/GB/month)"
)
onprem_load_balancer = st.number_input(
"Hardware Load Balancer",
min_value=0.0,
max_value=0.05,
value=0.010,
step=0.005,
format="%.3f",
key="onprem_lb",
help="Load balancer amortized cost (default: $0.010)"
)
onprem_power_cooling = st.number_input(
"Power & Cooling",
min_value=0.01,
max_value=0.10,
value=0.030,
step=0.005,
format="%.3f",
key="onprem_power",
help="Datacenter power and cooling (default: $0.030)"
)
onprem_datacenter_space = st.number_input(
"Datacenter Space",
min_value=0.0,
max_value=0.05,
value=0.015,
step=0.005,
format="%.3f",
key="onprem_space",
help="Rack space and facilities (default: $0.015)"
)
onprem_maintenance = st.number_input(
"Maintenance & Support",
min_value=0.0,
max_value=0.10,
value=0.025,
step=0.005,
format="%.3f",
key="onprem_maint",
help="Hardware maintenance contracts (default: $0.025)"
)
onprem_k8s_management = st.number_input(
"K8s Management Cost",
min_value=0.0,
max_value=0.20,
value=0.05,
step=0.01,
format="%.3f",
key="onprem_k8s",
help="Self-managed K8s operational cost (default: $0.05)"
)
# Reset button
if st.button("🔄 Reset All Pricing to Defaults", type="secondary"):
st.rerun()
# LLM Configuration
st.subheader("LLM Settings")
selected_model = st.selectbox(
"Select LLM Model",
list(MODELS.keys()),
index=2, # Default to Llama 3.3 70B
help="Choose the LLM model for inference workloads"
)
selected_gpu = st.selectbox(
"Select GPU Type",
list(GPUS.keys()),
index=1, # Default to H100 80GB
help="GPU type for LLM inference nodes"
)
precision = st.selectbox(
"Model Precision",
["FP16", "INT8", "INT4"],
index=0, # Default to FP16
help="Model precision affects memory usage and quality"
)
# Workload Configuration
st.subheader("Workload Settings")
conversations_per_minute = st.number_input(
"Conversations per Minute",
min_value=1,
max_value=5000,
value=200,
step=10,
help="Expected conversation throughput across all tenants"
)
tokens_per_conversation = st.number_input(
"Tokens per Conversation",
min_value=500,
max_value=20000,
value=2000,
step=100,
help="Average tokens per conversation (input + output)"
)
# Time period
time_period = st.selectbox(
"Cost Calculation Period",
["Monthly (30 days)", "Weekly (7 days)", "Daily (1 day)", "Hourly"],
index=0
)
days_map = {
"Monthly (30 days)": 30,
"Weekly (7 days)": 7,
"Daily (1 day)": 1,
"Hourly": 1/24
}
days = days_map[time_period]
# Calculate all requirements
infrastructure = calculate_detailed_infrastructure(num_tenants, apps_per_tenant)
# Apply custom pricing - create modified copies to avoid global state issues
def apply_custom_pricing():
    """Apply user-configured pricing to global dictionaries"""
    # NOTE: mutates the module-level CLOUD_PRICING and GPUS dicts in place;
    # every value on the right-hand side is a sidebar widget variable defined
    # earlier in this script (closure over module scope).
    # Update AWS pricing with user-configured values
    CLOUD_PRICING['AWS']['cost_per_node_hour'] = aws_standard_node
    CLOUD_PRICING['AWS']['vectordb_node']['cost_per_hour'] = aws_vectordb_node
    CLOUD_PRICING['AWS']['jump_host']['cost_per_hour'] = aws_jump_host
    CLOUD_PRICING['AWS']['managed_k8s_cost'] = aws_k8s_management
    # Update AWS GPU pricing
    GPUS["H200 141GB"]["pricing"]["aws"] = aws_h200
    GPUS["H100 80GB"]["pricing"]["aws"] = aws_h100
    GPUS["A100 80GB"]["pricing"]["aws"] = aws_a100_80
    GPUS["A100 40GB"]["pricing"]["aws"] = aws_a100_40
    GPUS["L40S"]["pricing"]["aws"] = aws_l40s
    # Update Azure pricing with user-configured values
    CLOUD_PRICING['Azure']['cost_per_node_hour'] = azure_standard_node
    CLOUD_PRICING['Azure']['vectordb_node']['cost_per_hour'] = azure_vectordb_node
    CLOUD_PRICING['Azure']['jump_host']['cost_per_hour'] = azure_jump_host
    CLOUD_PRICING['Azure']['managed_k8s_cost'] = azure_k8s_management
    # Update Azure GPU pricing
    GPUS["H200 141GB"]["pricing"]["azure"] = azure_h200
    GPUS["H100 80GB"]["pricing"]["azure"] = azure_h100
    GPUS["A100 80GB"]["pricing"]["azure"] = azure_a100_80
    GPUS["A100 40GB"]["pricing"]["azure"] = azure_a100_40
    # Update GCP pricing with user-configured values
    CLOUD_PRICING['GCP']['cost_per_node_hour'] = gcp_standard_node
    CLOUD_PRICING['GCP']['vectordb_node']['cost_per_hour'] = gcp_vectordb_node
    CLOUD_PRICING['GCP']['jump_host']['cost_per_hour'] = gcp_jump_host
    CLOUD_PRICING['GCP']['managed_k8s_cost'] = gcp_k8s_management
    # Update GCP GPU pricing (only these three GPU SKUs are configurable for GCP)
    GPUS["H100 80GB"]["pricing"]["gcp"] = gcp_h100
    GPUS["A100 80GB"]["pricing"]["gcp"] = gcp_a100_80
    GPUS["A100 40GB"]["pricing"]["gcp"] = gcp_a100_40
    # Update On-Premise pricing with user-configured values
    CLOUD_PRICING['On-Premise']['cost_per_node_hour'] = onprem_standard_node
    CLOUD_PRICING['On-Premise']['vectordb_node']['cost_per_hour'] = onprem_vectordb_node
    CLOUD_PRICING['On-Premise']['jump_host']['cost_per_hour'] = onprem_jump_host
    CLOUD_PRICING['On-Premise']['managed_k8s_cost'] = onprem_k8s_management
    # Update on-premise additional services
    CLOUD_PRICING['On-Premise']['additional_services'] = {
        'Network_Infrastructure': {'cost_per_hour': onprem_network, 'description': 'Switches, routers, firewalls (amortized)'},
        'Storage_SAN': {'cost_per_gb_month': onprem_storage_per_gb, 'description': 'SAN/NAS storage (1TB base, amortized)'},
        'Hardware_Load_Balancer': {'cost_per_hour': onprem_load_balancer, 'description': 'F5/Citrix ADC (amortized)'},
        'Power_Cooling': {'cost_per_hour': onprem_power_cooling, 'description': 'Datacenter power and cooling'},
        'Datacenter_Space': {'cost_per_hour': onprem_datacenter_space, 'description': 'Rack space and facilities costs'},
        'Maintenance_Support': {'cost_per_hour': onprem_maintenance, 'description': 'Hardware maintenance and vendor support contracts'}
    }
    # Update on-premise GPU pricing based on AWS prices and multiplier
    # ('NA' marks a GPU that AWS does not offer; those keep no on-premise price)
    for gpu_name in GPUS.keys():
        if 'aws' in GPUS[gpu_name]['pricing'] and GPUS[gpu_name]['pricing']['aws'] != 'NA':
            aws_price = GPUS[gpu_name]['pricing']['aws']
            GPUS[gpu_name]['pricing']['on-premise'] = round(aws_price * onprem_gpu_multiplier, 2)
# Apply all custom pricing
apply_custom_pricing()
# Bytes per model parameter at the selected precision (INT4 == half a byte).
precision_bytes = {
    "FP16": 2,
    "INT8": 1,
    "INT4": 0.5
}[precision]
model_spec = MODELS[selected_model]
gpu_spec = GPUS[selected_gpu]
# Size the GPU fleet for the requested throughput, model, and precision.
gpu_requirements = calculate_gpu_requirements(
    conversations_per_minute, tokens_per_conversation,
    model_spec, gpu_spec, precision_bytes
)
# Main Dashboard
st.header("📊 Infrastructure Overview")
st.markdown("---")  # Visual separator
# Row 1: Core Metrics - Use 4 columns for better spacing
col1, col2, col3, col4 = st.columns(4)
with col1:
    st.metric(
        label="🏢 Total Tenants",
        value=f"{num_tenants}",
        help="Number of tenant environments"
    )
with col2:
    st.metric(
        label="📦 Apps per Tenant",
        value=f"{apps_per_tenant}",
        help=f"Total applications: {infrastructure['totals']['total_apps']}"
    )
with col3:
    st.metric(
        label="🖥️ Worker Nodes",
        value=f"{infrastructure['totals']['total_nodes']}",
        help=f"Standard: {infrastructure['totals']['total_standard_nodes']}, VectorDB: {infrastructure['totals']['total_vectordb_nodes']}"
    )
with col4:
    # best_config may be falsy when no node layout satisfies the memory
    # constraint, so the node-count detail is optional.
    gpu_display = f"{gpu_requirements['actual_gpus_allocated']} GPUs"
    if gpu_requirements['best_config']:
        gpu_detail = f"({gpu_requirements['best_config']['num_nodes']} nodes)"
    else:
        gpu_detail = ""
    st.metric(
        label="🎮 GPU Resources",
        value=gpu_display,
        delta=gpu_detail,
        help=f"Configuration: {gpu_requirements['best_config']['num_nodes']}×{gpu_requirements['best_config']['gpus_per_node']} GPUs" if gpu_requirements['best_config'] else "GPU allocation"
    )
# Row 2: Performance Metrics
col1, col2, col3, col4 = st.columns(4)
with col1:
    st.metric(
        label="💬 Target Load",
        value=f"{conversations_per_minute}",
        delta="conv/min",
        help="Target conversation throughput"
    )
with col2:
    st.metric(
        label="📈 Max Capacity",
        value=f"{gpu_requirements['max_conversations_per_minute']:.0f}",
        delta="conv/min",
        help="Maximum system capacity"
    )
with col3:
    # Headroom = spare capacity beyond the configured target load.
    # FIX: guard the percentage against ZeroDivisionError when the computed
    # system capacity is 0 (e.g. no usable GPU configuration was found).
    max_capacity = gpu_requirements['max_conversations_per_minute']
    capacity_headroom = max_capacity - conversations_per_minute
    headroom_percentage = (capacity_headroom / max_capacity) * 100 if max_capacity else 0.0
    st.metric(
        label="📊 Capacity Headroom",
        value=f"{headroom_percentage:.1f}%",
        delta=f"{capacity_headroom:.0f} conv/min available",
        help="Available capacity beyond current target load"
    )
with col4:
    # Memory-bound vs compute-bound indicator for the GPU sizing.
    bottleneck_icon = "💾" if gpu_requirements['bottleneck'] == 'Memory' else "⚡"
    st.metric(
        label=f"{bottleneck_icon} Bottleneck",
        value=gpu_requirements['bottleneck'],
        help="Primary system constraint"
    )
st.markdown("---")  # Visual separator
# Create tabs for detailed views
# The returned tab handles are consumed by the `with tabN:` sections below.
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "🗏 Platform Infrastructure",
    "🖥️ GPU Requirements",
    "💰 Cost Analysis",
    "📈 Performance Analysis",
    "🔧 Technical Specifications"
])
with tab1:
    st.subheader("Platform Infrastructure Breakdown")
    # Show deployment node scaling info
    st.info(f"📦 **Deployment Node Scaling**: {infrastructure['totals']['deploy_nodes_per_tenant']} deployment node(s) per tenant for {apps_per_tenant} apps (1 node per 4 apps)")
    # Platform nodes breakdown
    # One table row per node type; '-' marks zero / not-applicable cells.
    breakdown_data = []
    for node_type, details in infrastructure['node_breakdown'].items():
        # Tenant-scoped count expressed per tenant (integer division).
        per_tenant_value = details['tenant'] // num_tenants if num_tenants > 0 and details['tenant'] > 0 else 0
        breakdown_data.append({
            'Node Type': node_type,
            'Base': details['base'] if details['base'] > 0 else '-',
            'Per Tenant': per_tenant_value if per_tenant_value > 0 else '-',
            'Total': details['total'],
            'CPU': details['cores'],
            'RAM (GB)': details['ram'],
            'VM Type': details.get('node_type', 'Standard'),
            'Purpose': details['purpose']
        })
    breakdown_df = pd.DataFrame(breakdown_data)
    # Use column configuration for better display
    st.dataframe(
        breakdown_df,
        use_container_width=True,
        hide_index=True,
        column_config={
            "Node Type": st.column_config.TextColumn("Node Type", width="medium"),
            "Base": st.column_config.TextColumn("Base", width="small"),
            "Per Tenant": st.column_config.TextColumn("Per Tenant", width="small"),
            "Total": st.column_config.NumberColumn("Total", width="small"),
            "CPU": st.column_config.NumberColumn("CPU", width="small"),
            "RAM (GB)": st.column_config.NumberColumn("RAM (GB)", width="small"),
            "VM Type": st.column_config.TextColumn("VM Type", width="medium"),
            "Purpose": st.column_config.TextColumn("Purpose", width="large")
        }
    )
    # Visual breakdown
    col1, col2 = st.columns(2)
    with col1:
        # Node distribution pie chart (only node types with a non-zero total)
        node_counts = {node_type: details['total']
                       for node_type, details in infrastructure['node_breakdown'].items()
                       if details['total'] > 0}
        fig_nodes = px.pie(
            values=list(node_counts.values()),
            names=list(node_counts.keys()),
            title="Platform Node Distribution"
        )
        st.plotly_chart(fig_nodes, use_container_width=True)
    with col2:
        # Resource distribution: long-format rows so CPU and RAM appear as
        # grouped bars per node type.
        resource_data = []
        for node_type, details in infrastructure['node_breakdown'].items():
            if details['total'] > 0:
                resource_data.extend([
                    {'Node Type': node_type, 'Resource': 'CPU Cores', 'Amount': details['cores']},
                    {'Node Type': node_type, 'Resource': 'RAM (GB)', 'Amount': details['ram']}
                ])
        resource_df = pd.DataFrame(resource_data)
        fig_resources = px.bar(
            resource_df,
            x='Node Type',
            y='Amount',
            color='Resource',
            title='Resource Distribution by Node Type',
            barmode='group'
        )
        st.plotly_chart(fig_resources, use_container_width=True)
    # Node type distribution
    st.subheader("Node Type Distribution")
    col1, col2 = st.columns(2)
    with col1:
        st.metric(
            "Standard Nodes (8 vCPU, 32GB RAM)",
            infrastructure['totals']['total_standard_nodes'],
            help="Platform, Compute, and Deploy nodes"
        )
    with col2:
        st.metric(
            "High-Memory Nodes (16 vCPU, 64GB RAM)",
            infrastructure['totals']['total_vectordb_nodes'],
            help="VectorDB nodes with higher memory capacity"
        )
with tab2:
    st.subheader("GPU Requirements Analysis")
    # GPU requirements metrics: memory- and throughput-driven counts, the
    # logical minimum, and what is actually allocated after node packing.
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric(
            "Memory-based GPUs",
            gpu_requirements['gpus_needed_memory'],
            help="GPUs needed to fit model in memory"
        )
    with col2:
        st.metric(
            "Throughput-based GPUs",
            gpu_requirements['gpus_needed_throughput'],
            help="GPUs needed for required throughput"
        )
    with col3:
        st.metric(
            "Logical GPUs Needed",
            gpu_requirements['total_gpus_needed'],
            help="Minimum GPUs needed (before node configuration)"
        )
    with col4:
        st.metric(
            "Actual GPUs Allocated",
            gpu_requirements['actual_gpus_allocated'],
            help="GPUs allocated based on standard node configurations",
            # delta shows over-allocation caused by fixed node sizes
            delta=gpu_requirements['actual_gpus_allocated'] - gpu_requirements['total_gpus_needed']
        )
    # GPU Node Configuration Analysis
    st.subheader("🖥️ GPU Node Configuration Options")
    if gpu_requirements['gpu_configurations']:
        # Display configuration options in a table
        config_data = []
        for config in gpu_requirements['gpu_configurations']:
            efficiency_score = f"{config['utilization']:.1f}%"
            memory_compatible = "✅" if config['meets_memory_req'] else "❌"
            config_data.append({
                "GPUs/Node": config['gpus_per_node'],
                "Mem": memory_compatible,
                "Nodes": config['num_nodes'],
                "Total GPUs": config['total_gpus_allocated'],
                "GPU Util": efficiency_score,
                "Waste": config['gpu_waste'],
                "Mem Util": f"{config['memory_utilization']:.1f}%"
            })
        config_df = pd.DataFrame(config_data)
        st.dataframe(
            config_df,
            use_container_width=True,
            hide_index=True,
            column_config={
                "GPUs/Node": st.column_config.NumberColumn("GPUs/Node", width="small"),
                "Mem": st.column_config.TextColumn("Mem ✓", width="small"),
                "Nodes": st.column_config.NumberColumn("Nodes", width="small"),
                "Total GPUs": st.column_config.NumberColumn("Total GPUs", width="small"),
                "GPU Util": st.column_config.TextColumn("GPU Util", width="small"),
                "Waste": st.column_config.NumberColumn("Waste", width="small"),
                "Mem Util": st.column_config.TextColumn("Mem Util", width="small")
            }
        )
        # Highlight the recommended configuration
        if gpu_requirements['best_config']:
            best = gpu_requirements['best_config']
            st.success(f"💡 **Recommended Configuration**: {best['num_nodes']} nodes × {best['gpus_per_node']} GPUs = {best['total_gpus_allocated']} total GPUs ({best['utilization']:.1f}% utilization)")
    # Show minimum requirement info
    st.info(f"**Memory Constraint**: Minimum {gpu_requirements['min_gpus_per_node']} GPUs per node required to fit {gpu_requirements['model_memory_gb']:.1f}GB model in {gpu_spec['memory']}GB GPU memory")
    # GPU configuration visualization
    col1, col2 = st.columns(2)
    with col1:
        # Node configuration comparison
        if gpu_requirements['gpu_configurations']:
            config_chart_data = pd.DataFrame(gpu_requirements['gpu_configurations'])
            fig_configs = px.bar(
                config_chart_data,
                x='gpus_per_node',
                y='utilization',
                title='GPU Utilization by Node Configuration',
                labels={'gpus_per_node': 'GPUs per Node', 'utilization': 'Utilization (%)'}
            )
            st.plotly_chart(fig_configs, use_container_width=True)
    with col2:
        # GPU allocation vs requirement
        allocation_data = pd.DataFrame({
            'Metric': ['Required GPUs', 'Allocated GPUs'],
            'Count': [gpu_requirements['total_gpus_needed'], gpu_requirements['actual_gpus_allocated']]
        })
        fig_allocation = px.bar(
            allocation_data,
            x='Metric',
            y='Count',
            title='GPU Allocation vs Requirement',
            color='Metric'
        )
        st.plotly_chart(fig_allocation, use_container_width=True)
    # Model and GPU specifications
    st.subheader("🔧 Model & GPU Specifications")
    # GPU configuration table: single summary row; MoE models show
    # total vs active parameter counts.
    gpu_config_data = [{
        'Model': selected_model,
        'Parameters': f"{model_spec['params']}B ({model_spec['active_params']}B active)" if model_spec['params'] != model_spec['active_params'] else f"{model_spec['params']}B",
        'Model Memory Required': f"{gpu_requirements['model_memory_gb']:.1f} GB",
        'GPU Type': selected_gpu,
        'GPU Memory per Unit': f"{gpu_spec['memory']} GB",
        'GPUs Required (Logic)': gpu_requirements['total_gpus_needed'],
        'GPUs Allocated (Actual)': gpu_requirements['actual_gpus_allocated'],
        'GPU Nodes': f"{gpu_requirements['best_config']['num_nodes']} nodes × {gpu_requirements['best_config']['gpus_per_node']} GPUs" if gpu_requirements['best_config'] else 'N/A',
        'Total GPU Memory': f"{gpu_requirements['actual_gpus_allocated'] * gpu_spec['memory']} GB",
        'Memory Utilization': f"{(gpu_requirements['model_memory_gb'] / (gpu_requirements['actual_gpus_allocated'] * gpu_spec['memory']) * 100):.1f}%",
        'Precision': precision
    }]
    gpu_config_df = pd.DataFrame(gpu_config_data)
    st.dataframe(gpu_config_df, use_container_width=True)
    # Performance metrics
    col1, col2 = st.columns(2)
    with col1:
        # TPS comparison
        tps_data = pd.DataFrame({
            'Metric': ['Required TPS', 'Single GPU TPS', 'Total System TPS'],
            'Value': [
                gpu_requirements['required_tps'],
                gpu_requirements['estimated_tps'],
                gpu_requirements['total_system_tps']
            ]
        })
        fig_tps = px.bar(
            tps_data,
            x='Metric',
            y='Value',
            title='Tokens Per Second Analysis',
            color='Metric'
        )
        st.plotly_chart(fig_tps, use_container_width=True)
    with col2:
        # Capacity utilization
        utilization_data = pd.DataFrame({
            'Metric': ['Required Capacity', 'Available Capacity'],
            'Conversations/Min': [
                conversations_per_minute,
                gpu_requirements['max_conversations_per_minute']
            ]
        })
        fig_capacity = px.bar(
            utilization_data,
            x='Metric',
            y='Conversations/Min',
            title='Conversation Capacity Analysis',
            color='Metric'
        )
        st.plotly_chart(fig_capacity, use_container_width=True)
with tab3:
    st.subheader("Comprehensive Cost Analysis")
    # Show customization status for all providers.
    # These literals must stay in sync with the sidebar widget defaults above;
    # a provider is flagged "customized" when any tracked value differs.
    default_values = {
        'aws': {'standard': 0.384, 'vectordb': 0.768, 'jump': 0.096, 'k8s': 0.10},
        'azure': {'standard': 0.384, 'vectordb': 0.768, 'jump': 0.096, 'k8s': 0.0},
        'gcp': {'standard': 0.379, 'vectordb': 0.758, 'jump': 0.067, 'k8s': 0.10},
        'onprem': {'standard': 0.192, 'vectordb': 0.384, 'jump': 0.048, 'gpu_mult': 0.55, 'k8s': 0.05}
    }
    customizations = []
    # Check AWS customizations
    if (aws_standard_node != default_values['aws']['standard'] or
            aws_vectordb_node != default_values['aws']['vectordb'] or
            aws_k8s_management != default_values['aws']['k8s']):
        customizations.append("AWS")
    # Check Azure customizations
    if (azure_standard_node != default_values['azure']['standard'] or
            azure_vectordb_node != default_values['azure']['vectordb'] or
            azure_k8s_management != default_values['azure']['k8s']):
        customizations.append("Azure")
    # Check GCP customizations
    if (gcp_standard_node != default_values['gcp']['standard'] or
            gcp_vectordb_node != default_values['gcp']['vectordb'] or
            gcp_k8s_management != default_values['gcp']['k8s']):
        customizations.append("GCP")
    # Check On-Premise customizations
    if (onprem_standard_node != default_values['onprem']['standard'] or
            onprem_vectordb_node != default_values['onprem']['vectordb'] or
            onprem_gpu_multiplier != default_values['onprem']['gpu_mult'] or
            onprem_k8s_management != default_values['onprem']['k8s']):
        customizations.append("On-Premise")
    if customizations:
        st.warning(f"""
**✏️ Custom Pricing Active for: {', '.join(customizations)}**
Using user-configured pricing instead of defaults. View details in Technical Specifications tab or adjust in sidebar.
""")
    # Add info box about cost models
    st.info("""
**💡 Cost Model Information**:
- **Cloud Providers (AWS/Azure/GCP)**: Pay-as-you-go pricing with per-hour compute and GPU costs
- **On-Premise**: Hardware amortized over 3-year lifecycle + operating costs (power, cooling, maintenance)
- **Customization**: All pricing values can be adjusted in the sidebar to match your actual costs
**🔧 Customize:** Use the sidebar "Cloud Provider Pricing" sections to adjust costs
""")
    # Calculate costs for all providers
    all_costs = {}
    for provider in CLOUD_PRICING.keys():
        all_costs[provider] = calculate_detailed_costs(
            provider, infrastructure, gpu_requirements, gpu_spec, days
        )
    # Cost comparison table
    # Numeric total kept as a hidden column for chart plotting below.
    cost_comparison_data = []
    for provider, costs in all_costs.items():
        gpu_available = costs['totals']['gpu_available']
        cost_comparison_data.append({
            'Provider': provider,
            'GPU': '✅' if gpu_available else '❌',
            'Platform': f"${costs['totals']['platform_cost']:.2f}",
            'GPU Cost': format_cost_for_display(costs['totals']['gpu_cost'], gpu_available),
            'Total': format_cost_for_display(costs['totals']['total_cost'], gpu_available),
            'Per Hour': format_cost_for_display(costs['totals']['cost_per_hour'], gpu_available),
            'Per Day': format_cost_for_display(costs['totals']['cost_per_day'], gpu_available),
            'Total_Numeric': costs['totals']['total_cost'] if gpu_available else None,
            'GPU_Available': gpu_available
        })
    cost_df = pd.DataFrame(cost_comparison_data)
    display_cost_df = cost_df.drop(['Total_Numeric', 'GPU_Available'], axis=1)
    st.dataframe(
        display_cost_df,
        use_container_width=True,
        hide_index=True,
        column_config={
            "Provider": st.column_config.TextColumn("Provider", width="medium"),
            "GPU": st.column_config.TextColumn("GPU ✓", width="small"),
            "Platform": st.column_config.TextColumn("Platform Cost", width="medium"),
            "GPU Cost": st.column_config.TextColumn("GPU Cost", width="medium"),
            "Total": st.column_config.TextColumn("Total Cost", width="medium"),
            "Per Hour": st.column_config.TextColumn("$/Hour", width="medium"),
            "Per Day": st.column_config.TextColumn("$/Day", width="medium")
        }
    )
    # Add download button for cost report (full JSON snapshot of this run)
    report_data = create_downloadable_cost_report(
        all_costs, infrastructure, gpu_requirements, model_spec, gpu_spec,
        selected_model, selected_gpu, num_tenants, apps_per_tenant, conversations_per_minute,
        tokens_per_conversation, precision, time_period
    )
    st.download_button(
        label="📥 Download Complete Cost Report (JSON)",
        data=json.dumps(report_data, indent=2),
        file_name=f"llmops_cost_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
        mime="application/json",
        help="Download comprehensive cost analysis with all services and configurations"
    )
    # Create CSV version for easier viewing
    # One flat row per provider; 'N/A' fills GPU columns when the GPU is
    # not offered by that provider.
    csv_data = []
    for provider, provider_data in report_data['cost_breakdown_by_provider'].items():
        if provider_data['gpu_available']:
            csv_data.append({
                'Provider': provider,
                'GPU_Available': 'Yes',
                'Platform_Nodes': infrastructure['totals']['total_standard_nodes'],
                'VectorDB_Nodes': infrastructure['totals']['total_vectordb_nodes'],
                'GPU_Nodes': gpu_requirements['total_gpus_needed'],
                'Kubernetes_Nodes_Cost': provider_data['platform_costs']['kubernetes_nodes'],
                'VectorDB_Nodes_Cost': provider_data['platform_costs']['vectordb_nodes'],
                'Jump_Host_Cost': provider_data['platform_costs']['jump_host'],
                'Additional_Services_Cost': provider_data['platform_costs']['additional_services'],
                'K8s_Management_Cost': provider_data['platform_costs']['k8s_management'],
                'Total_Platform_Cost': provider_data['platform_costs']['platform_total'],
                'GPU_Cost_Per_Hour': provider_data['gpu_costs']['gpu_cost_per_hour'],
                'Total_GPU_Cost': provider_data['gpu_costs']['total_gpu_cost'],
                'Total_Infrastructure_Cost': provider_data['totals']['total_cost'],
                'Cost_Per_Hour': provider_data['totals']['cost_per_hour'],
                'Cost_Per_Day': provider_data['totals']['cost_per_day']
            })
        else:
            csv_data.append({
                'Provider': provider,
                'GPU_Available': 'No',
                'Platform_Nodes': infrastructure['totals']['total_standard_nodes'],
                'VectorDB_Nodes': infrastructure['totals']['total_vectordb_nodes'],
                'GPU_Nodes': 'N/A',
                'Kubernetes_Nodes_Cost': provider_data['platform_costs']['kubernetes_nodes'],
                'VectorDB_Nodes_Cost': provider_data['platform_costs']['vectordb_nodes'],
                'Jump_Host_Cost': provider_data['platform_costs']['jump_host'],
                'Additional_Services_Cost': provider_data['platform_costs']['additional_services'],
                'K8s_Management_Cost': provider_data['platform_costs']['k8s_management'],
                'Total_Platform_Cost': provider_data['platform_costs']['platform_total'],
                'GPU_Cost_Per_Hour': 'N/A',
                'Total_GPU_Cost': 'N/A',
                'Total_Infrastructure_Cost': 'N/A',
                'Cost_Per_Hour': 'N/A',
                'Cost_Per_Day': 'N/A'
            })
    csv_df = pd.DataFrame(csv_data)
    csv_string = csv_df.to_csv(index=False)
    st.download_button(
        label="📊 Download Cost Summary (CSV)",
        data=csv_string,
        file_name=f"llmops_cost_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
        mime="text/csv",
        help="Download cost summary in CSV format for spreadsheet analysis"
    )
    # Cost breakdown visualization - only for providers with GPU available
    available_providers_data = cost_df[cost_df['GPU_Available'] == True]
    col1, col2 = st.columns(2)
    with col1:
        # Provider comparison - only available providers
        if not available_providers_data.empty:
            fig_provider_comparison = px.bar(
                available_providers_data,
                x='Provider',
                y='Total_Numeric',
                title=f'Total Cost Comparison ({time_period}) - All Deployment Options',
                labels={'Total_Numeric': 'Total Cost (USD)'},
                color='Provider'
            )
            st.plotly_chart(fig_provider_comparison, use_container_width=True)
        else:
            st.warning("⚠️ No providers have the selected GPU available for cost comparison")
    with col2:
        # Cost breakdown for selected provider (cheapest available)
        available_providers = get_available_providers_for_gpu(gpu_spec)
        if available_providers:
            cheapest_provider = min(available_providers,
                                    key=lambda x: all_costs[x]['totals']['total_cost'])
            cheapest_costs = all_costs[cheapest_provider]
            breakdown_values = [
                cheapest_costs['totals']['platform_cost'],
                cheapest_costs['totals']['gpu_cost']
            ]
            breakdown_labels = ['Platform Infrastructure', 'GPU Infrastructure']
            fig_breakdown = px.pie(
                values=breakdown_values,
                names=breakdown_labels,
                title=f'{cheapest_provider} - Cost Breakdown'
            )
            st.plotly_chart(fig_breakdown, use_container_width=True)
        else:
            st.warning("⚠️ No providers have the selected GPU available")
    # Detailed cost breakdown for cheapest available provider
    available_providers = get_available_providers_for_gpu(gpu_spec)
    if available_providers:
        cheapest_provider = min(available_providers,
                                key=lambda x: all_costs[x]['totals']['total_cost'])
        st.subheader(f"💡 Most Cost-Effective Option: {cheapest_provider}")
        if cheapest_provider == 'On-Premise':
            st.success(f"✅ **On-Premise deployment offers the lowest cost** with {selected_gpu}")
            st.info("💰 **Note**: On-premise costs assume 3-year hardware amortization. Initial capex and datacenter setup costs are not included in hourly rates.")
        else:
            st.info(f"✅ **{selected_gpu} is available on {cheapest_provider}**")
        cheapest_costs = all_costs[cheapest_provider]
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric(
                "Platform Infrastructure",
                f"${cheapest_costs['totals']['platform_cost']:.2f}",
                help="Kubernetes nodes (including VectorDB), networking, storage, management"
            )
        with col2:
            st.metric(
                "GPU Infrastructure",
                f"${cheapest_costs['totals']['gpu_cost']:.2f}",
                help=f"{gpu_requirements['total_gpus_needed']} x {selected_gpu}"
            )
        with col3:
            # Calculate savings compared to most expensive available provider
            if len(available_providers) > 1:
                most_expensive_available = max(available_providers,
                                               key=lambda x: all_costs[x]['totals']['total_cost'])
                savings = all_costs[most_expensive_available]['totals']['total_cost'] - cheapest_costs['totals']['total_cost']
                savings_pct = (savings / all_costs[most_expensive_available]['totals']['total_cost']) * 100
                st.metric(
                    "Potential Savings",
                    f"${savings:.2f}",
                    help=f"Savings compared to {most_expensive_available} ({savings_pct:.1f}%)"
                )
            else:
                st.metric(
                    "Provider Status",
                    "Only Option",
                    help="This is the only provider with the selected GPU available"
                )
        # Cloud vs On-Premise comparison if both are available
        if 'On-Premise' in available_providers and len(available_providers) > 1:
            st.subheader("☁️ Cloud vs 🏢 On-Premise Comparison")
            onprem_cost = all_costs['On-Premise']['totals']['total_cost']
            cloud_providers = [p for p in available_providers if p != 'On-Premise']
            comparison_data = []
            for provider in ['On-Premise'] + cloud_providers:
                comparison_data.append({
                    'Deployment Type': 'On-Premise' if provider == 'On-Premise' else 'Cloud',
                    'Provider': provider,
                    'Total Cost': all_costs[provider]['totals']['total_cost'],
                    'Platform Cost': all_costs[provider]['totals']['platform_cost'],
                    'GPU Cost': all_costs[provider]['totals']['gpu_cost']
                })
            comp_df = pd.DataFrame(comparison_data)
            # Create grouped bar chart (platform + GPU stacked per provider)
            fig_comparison = go.Figure()
            fig_comparison.add_trace(go.Bar(
                name='Platform Cost',
                x=comp_df['Provider'],
                y=comp_df['Platform Cost'],
                marker_color='lightblue'
            ))
            fig_comparison.add_trace(go.Bar(
                name='GPU Cost',
                x=comp_df['Provider'],
                y=comp_df['GPU Cost'],
                marker_color='orange'
            ))
            fig_comparison.update_layout(
                title='Cost Breakdown: On-Premise vs Cloud',
                xaxis_title='Provider',
                yaxis_title='Cost (USD)',
                barmode='stack'
            )
            st.plotly_chart(fig_comparison, use_container_width=True)
            # Calculate average cloud cost (cloud_providers is non-empty here)
            avg_cloud_cost = sum([all_costs[p]['totals']['total_cost'] for p in cloud_providers]) / len(cloud_providers)
            cloud_savings = avg_cloud_cost - onprem_cost
            cloud_savings_pct = (cloud_savings / avg_cloud_cost) * 100
            if cloud_savings > 0:
                st.success(f"💰 **On-Premise Savings**: ${cloud_savings:.2f} ({cloud_savings_pct:.1f}%) compared to average cloud cost over {time_period}")
            else:
                st.info(f"☁️ **Cloud is more cost-effective** for this configuration over {time_period}")
    else:
        st.error(f"❌ **No Providers Available**: The selected GPU ({selected_gpu}) is not available on any deployment option")
        st.warning("**Recommendation**: Please select a different GPU model that is available")
        # Show which GPUs are available on which providers
        st.subheader("🔍 GPU Availability by Provider")
        availability_data = []
        for gpu_name, gpu_data in GPUS.items():
            available_on = get_available_providers_for_gpu(gpu_data)
            availability_data.append({
                'GPU Model': gpu_name,
                'Memory': f"{gpu_data['memory']} GB",
                'Available On': ', '.join(available_on) if available_on else 'None',
                'Deployment Options': len(available_on)
            })
        availability_df = pd.DataFrame(availability_data)
        availability_df = availability_df.sort_values('Deployment Options', ascending=False)
        st.dataframe(availability_df, use_container_width=True)
with tab4:
    st.subheader("Performance Analysis & Scaling")
    # Performance metrics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric(
            "Total System TPS",
            f"{gpu_requirements['total_system_tps']:.0f}",
            help="Combined throughput of all GPUs"
        )
    with col2:
        st.metric(
            "Conversation Capacity",
            f"{gpu_requirements['max_conversations_per_minute']:.0f}/min",
            help="Maximum conversations the system can handle"
        )
    with col3:
        # FIX: guard against ZeroDivisionError when the computed system
        # capacity is 0 (mirrors the same fix in the overview metrics).
        max_capacity = gpu_requirements['max_conversations_per_minute']
        capacity_headroom = max_capacity - conversations_per_minute
        headroom_percentage = (capacity_headroom / max_capacity) * 100 if max_capacity else 0.0
        st.metric(
            "Capacity Headroom",
            f"{headroom_percentage:.1f}%",
            delta=f"{capacity_headroom:.0f} conv/min available",
            help="Available capacity beyond current target load"
        )
    # Scaling analysis
    st.subheader("Scaling Analysis")
    # Create scaling scenarios: re-run the GPU sizing at several load multiples
    scaling_scenarios = [0.5, 1.0, 1.5, 2.0, 3.0, 5.0]
    scaling_data = []
    for multiplier in scaling_scenarios:
        scaled_conversations = int(conversations_per_minute * multiplier)
        scaled_gpu_reqs = calculate_gpu_requirements(
            scaled_conversations, tokens_per_conversation,
            model_spec, gpu_spec, precision_bytes
        )
        # FIX: same zero-capacity guard for the headroom column
        scaled_capacity = scaled_gpu_reqs['max_conversations_per_minute']
        scaled_headroom_pct = ((scaled_capacity - scaled_conversations) / scaled_capacity * 100) if scaled_capacity else 0.0
        scaling_data.append({
            'Load Multiplier': f"{multiplier}x",
            'Conversations/Min': scaled_conversations,
            'Logical GPUs': scaled_gpu_reqs['total_gpus_needed'],
            'Allocated GPUs': scaled_gpu_reqs['actual_gpus_allocated'],
            'GPU Nodes': f"{scaled_gpu_reqs['best_config']['num_nodes']}×{scaled_gpu_reqs['best_config']['gpus_per_node']}" if scaled_gpu_reqs['best_config'] else 'N/A',
            'System Capacity': f"{scaled_capacity:.0f}",
            'Headroom %': f"{scaled_headroom_pct:.1f}%"
        })
    scaling_df = pd.DataFrame(scaling_data)
    st.dataframe(
        scaling_df,
        use_container_width=True,
        hide_index=True,
        column_config={
            "Load Multiplier": st.column_config.TextColumn("Load", width="small"),
            "Conversations/Min": st.column_config.NumberColumn("Conv/Min", width="small"),
            "Logical GPUs": st.column_config.NumberColumn("Logical", width="small"),
            "Allocated GPUs": st.column_config.NumberColumn("Allocated", width="small"),
            "GPU Nodes": st.column_config.TextColumn("GPU Nodes", width="medium"),
            "System Capacity": st.column_config.TextColumn("Capacity", width="medium"),
            "Headroom %": st.column_config.TextColumn("Headroom %", width="small")
        }
    )
    # Scaling visualization
    fig_scaling = go.Figure()
    # Add lines for both logical and allocated GPUs ("0.5x" -> 0.5 on x axis)
    fig_scaling.add_trace(go.Scatter(
        x=[float(x.replace('x', '')) for x in scaling_df['Load Multiplier']],
        y=scaling_df['Logical GPUs'].astype(int),
        mode='lines+markers',
        name='Logical GPUs Required',
        line=dict(color='blue', dash='dash')
    ))
    fig_scaling.add_trace(go.Scatter(
        x=[float(x.replace('x', '')) for x in scaling_df['Load Multiplier']],
        y=scaling_df['Allocated GPUs'].astype(int),
        mode='lines+markers',
        name='Allocated GPUs (Actual)',
        line=dict(color='red')
    ))
    fig_scaling.update_layout(
        title='GPU Scaling Requirements (Logical vs Allocated)',
        xaxis_title='Load Multiplier',
        yaxis_title='Number of GPUs'
    )
    st.plotly_chart(fig_scaling, use_container_width=True)
    # Application scaling analysis
    st.subheader("Application Scaling Analysis")
    # Deployment nodes scale at 1 node per 4 apps (rounded up).
    app_scaling_scenarios = [4, 8, 12, 16, 20, 24, 32, 40]
    app_scaling_data = []
    for apps in app_scaling_scenarios:
        deploy_nodes = math.ceil(apps / 4)
        app_scaling_data.append({
            'Apps per Tenant': apps,
            'Total Apps': apps * num_tenants,
            'Deploy Nodes per Tenant': deploy_nodes,
            'Total Deploy Nodes': deploy_nodes * num_tenants,
            'Deploy Node Ratio': f"1:{4 if apps >= 4 else apps}"
        })
    app_scaling_df = pd.DataFrame(app_scaling_data)
    st.dataframe(
        app_scaling_df,
        use_container_width=True,
        hide_index=True,
        column_config={
            "Apps per Tenant": st.column_config.NumberColumn("Apps/Tenant", width="small"),
            "Total Apps": st.column_config.NumberColumn("Total Apps", width="small"),
            "Deploy Nodes per Tenant": st.column_config.NumberColumn("Deploy/Tenant", width="small"),
            "Total Deploy Nodes": st.column_config.NumberColumn("Total Deploy", width="medium"),
            "Deploy Node Ratio": st.column_config.TextColumn("Ratio", width="small")
        }
    )
    # App scaling visualization
    fig_app_scaling = px.line(
        app_scaling_df,
        x='Apps per Tenant',
        y='Total Deploy Nodes',
        title='Deployment Nodes Scaling with Application Count',
        markers=True
    )
    st.plotly_chart(fig_app_scaling, use_container_width=True)
with tab5:
st.subheader("Technical Specifications")
# Model specifications
st.markdown("### 🤖 LLM Model Specifications")
model_specs_data = [{
'Property': 'Model Name',
'Value': selected_model
}, {
'Property': 'Organization',
'Value': model_spec['org']
}, {
'Property': 'Total Parameters',
'Value': f"{model_spec['params']}B"
}, {
'Property': 'Active Parameters',
'Value': f"{model_spec['active_params']}B"
}, {
'Property': 'Max Context Length',
'Value': f"{model_spec['max_context']:,} tokens"
}, {
'Property': 'Base TPS',
'Value': f"{model_spec['base_tps']:,}"
}, {
'Property': 'License',
'Value': model_spec['license']
}, {
'Property': 'Architecture Type',
'Value': 'Mixture of Experts (MoE)' if model_spec['params'] != model_spec['active_params'] else 'Dense Model'
}]
model_specs_df = pd.DataFrame(model_specs_data)
st.dataframe(model_specs_df, use_container_width=True)
# GPU specifications
st.markdown("### 🖥️ GPU Specifications")
# Property/value rows for the selected GPU; `gpu_spec` and `precision`
# are set earlier in the function (outside this chunk).
gpu_specs_data = [{
'Property': 'GPU Model',
'Value': selected_gpu
}, {
'Property': 'Memory Capacity',
'Value': f"{gpu_spec['memory']} GB"
}, {
'Property': 'Compute Capability',
'Value': gpu_spec['compute']
}, {
# Throughput band (tokens/sec) rather than a single point estimate.
'Property': 'TPS Range',
'Value': f"{gpu_spec['tps_min']:,} - {gpu_spec['tps_max']:,}"
}, {
'Property': 'Efficiency Tier',
'Value': gpu_spec['efficiency_tier']
}, {
'Property': 'Model Precision',
'Value': precision
}]
gpu_specs_df = pd.DataFrame(gpu_specs_data)
st.dataframe(gpu_specs_df, use_container_width=True)
# Platform specifications
st.markdown("### 🗏 Platform Infrastructure Specifications")
# Component/specification rows summarizing the computed sizing. Totals come
# from the `infrastructure` and `gpu_requirements` dicts built earlier.
platform_specs_data = [{
'Component': 'Standard K8s Nodes',
'Specification': f"{infrastructure['totals']['total_standard_nodes']} nodes × 8 vCPUs × 32GB RAM"
}, {
'Component': 'VectorDB Nodes',
'Specification': f"{infrastructure['totals']['total_vectordb_nodes']} nodes × 16 vCPUs × 64GB RAM"
}, {
# When a node-packing config was found (`best_config` truthy) show the
# nodes × GPUs/node layout; otherwise fall back to the raw GPU count.
'Component': 'GPU Nodes',
'Specification': f"{gpu_requirements['actual_gpus_allocated']} × {selected_gpu} ({gpu_requirements['best_config']['num_nodes']} nodes × {gpu_requirements['best_config']['gpus_per_node']} GPUs)" if gpu_requirements['best_config'] else f"{gpu_requirements['total_gpus_needed']} × {selected_gpu}"
}, {
'Component': 'Total CPU Cores',
'Specification': f"{infrastructure['totals']['total_cpu']} cores"
}, {
'Component': 'Total RAM',
'Specification': f"{infrastructure['totals']['total_ram']} GB"
}, {
# Aggregate VRAM across all allocated GPUs of the selected type.
'Component': 'Total GPU Memory',
'Specification': f"{gpu_requirements['actual_gpus_allocated'] * gpu_spec['memory']} GB"
}, {
'Component': 'Applications per Tenant',
'Specification': f"{apps_per_tenant} apps × {num_tenants} tenants = {infrastructure['totals']['total_apps']} total apps"
}, {
'Component': 'Deployment Nodes per Tenant',
'Specification': f"{infrastructure['totals']['deploy_nodes_per_tenant']} node(s) (1 node per 4 apps)"
}]
platform_specs_df = pd.DataFrame(platform_specs_data)
st.dataframe(platform_specs_df, use_container_width=True)
# Provider Pricing Configuration Summary
st.markdown("### 💰 Provider Pricing Configuration")
# Create tabs for each provider
price_tab1, price_tab2, price_tab3, price_tab4 = st.tabs(["AWS", "Azure", "GCP", "On-Premise"])
# AWS tab: each row shows the current (possibly sidebar-overridden) hourly
# rate and flags it Custom vs. Default by comparing against the hard-coded
# default literal. NOTE(review): `!=` on floats is fragile if the sidebar
# widget ever returns a value with float rounding — confirm the widgets
# return the exact default literals.
with price_tab1:
aws_config_data = [{
'Cost Component': 'Standard Compute Node',
'Specification': 'm5.2xlarge (8 vCPU, 32GB)',
'Cost per Hour': f"${aws_standard_node:.3f}",
'Status': '✏️ Custom' if aws_standard_node != 0.384 else '✅ Default'
}, {
'Cost Component': 'VectorDB Node',
'Specification': 'm5.4xlarge (16 vCPU, 64GB)',
'Cost per Hour': f"${aws_vectordb_node:.3f}",
'Status': '✏️ Custom' if aws_vectordb_node != 0.768 else '✅ Default'
}, {
'Cost Component': 'Jump Host',
'Specification': 'm5.large (2 vCPU, 8GB)',
'Cost per Hour': f"${aws_jump_host:.3f}",
'Status': '✏️ Custom' if aws_jump_host != 0.096 else '✅ Default'
}, {
'Cost Component': 'EKS Management',
'Specification': 'Managed Kubernetes',
'Cost per Hour': f"${aws_k8s_management:.3f}",
'Status': '✏️ Custom' if aws_k8s_management != 0.10 else '✅ Default'
}, {
'Cost Component': 'H200 141GB GPU',
'Specification': 'Flagship+ GPU',
'Cost per Hour': f"${aws_h200:.2f}",
'Status': '✏️ Custom' if aws_h200 != 15.70 else '✅ Default'
}, {
'Cost Component': 'H100 80GB GPU',
'Specification': 'Flagship GPU',
'Cost per Hour': f"${aws_h100:.2f}",
'Status': '✏️ Custom' if aws_h100 != 6.01 else '✅ Default'
}, {
'Cost Component': 'A100 80GB GPU',
'Specification': 'Excellent GPU',
'Cost per Hour': f"${aws_a100_80:.2f}",
'Status': '✏️ Custom' if aws_a100_80 != 3.43 else '✅ Default'
}, {
'Cost Component': 'A100 40GB GPU',
'Specification': 'Good GPU',
'Cost per Hour': f"${aws_a100_40:.2f}",
'Status': '✏️ Custom' if aws_a100_40 != 2.75 else '✅ Default'
}, {
'Cost Component': 'L40S GPU',
'Specification': 'Very Good GPU',
'Cost per Hour': f"${aws_l40s:.2f}",
'Status': '✏️ Custom' if aws_l40s != 1.67 else '✅ Default'
}]
aws_config_df = pd.DataFrame(aws_config_data)
st.dataframe(aws_config_df, use_container_width=True)
# Azure tab: same Custom/Default pattern as the other provider tabs —
# the hard-coded literals are the Azure default hourly rates.
with price_tab2:
azure_config_data = [{
'Cost Component': 'Standard Compute Node',
'Specification': 'Standard_D8s_v3 (8 vCPU, 32GB)',
'Cost per Hour': f"${azure_standard_node:.3f}",
'Status': '✏️ Custom' if azure_standard_node != 0.384 else '✅ Default'
}, {
'Cost Component': 'VectorDB Node',
'Specification': 'Standard_D16s_v3 (16 vCPU, 64GB)',
'Cost per Hour': f"${azure_vectordb_node:.3f}",
'Status': '✏️ Custom' if azure_vectordb_node != 0.768 else '✅ Default'
}, {
'Cost Component': 'Jump Host',
'Specification': 'Standard_D2s_v3 (2 vCPU, 8GB)',
'Cost per Hour': f"${azure_jump_host:.3f}",
'Status': '✏️ Custom' if azure_jump_host != 0.096 else '✅ Default'
}, {
# AKS control plane is free by default, hence the 0.0 comparison.
'Cost Component': 'AKS Management',
'Specification': 'Managed Kubernetes (Free)',
'Cost per Hour': f"${azure_k8s_management:.3f}",
'Status': '✏️ Custom' if azure_k8s_management != 0.0 else '✅ Default'
}, {
'Cost Component': 'H200 141GB GPU',
'Specification': 'Flagship+ GPU',
'Cost per Hour': f"${azure_h200:.2f}",
'Status': '✏️ Custom' if azure_h200 != 12.29 else '✅ Default'
}, {
'Cost Component': 'H100 80GB GPU',
'Specification': 'Flagship GPU',
'Cost per Hour': f"${azure_h100:.2f}",
'Status': '✏️ Custom' if azure_h100 != 6.98 else '✅ Default'
}, {
'Cost Component': 'A100 80GB GPU',
'Specification': 'Excellent GPU',
'Cost per Hour': f"${azure_a100_80:.2f}",
'Status': '✏️ Custom' if azure_a100_80 != 3.67 else '✅ Default'
}, {
# NOTE(review): A100 40GB default (3.67) equals the 80GB default above —
# confirm this is intentional and not a copy-paste of the 80GB rate.
'Cost Component': 'A100 40GB GPU',
'Specification': 'Good GPU',
'Cost per Hour': f"${azure_a100_40:.2f}",
'Status': '✏️ Custom' if azure_a100_40 != 3.67 else '✅ Default'
}]
azure_config_df = pd.DataFrame(azure_config_data)
st.dataframe(azure_config_df, use_container_width=True)
# GCP tab: same Custom/Default pattern; note GCP's list has no H200/L40S
# rows (fewer GPU SKUs than the AWS tab).
with price_tab3:
gcp_config_data = [{
'Cost Component': 'Standard Compute Node',
'Specification': 'n1-standard-8 (8 vCPU, 30GB)',
'Cost per Hour': f"${gcp_standard_node:.3f}",
'Status': '✏️ Custom' if gcp_standard_node != 0.379 else '✅ Default'
}, {
'Cost Component': 'VectorDB Node',
'Specification': 'n1-standard-16 (16 vCPU, 60GB)',
'Cost per Hour': f"${gcp_vectordb_node:.3f}",
'Status': '✏️ Custom' if gcp_vectordb_node != 0.758 else '✅ Default'
}, {
'Cost Component': 'Jump Host',
'Specification': 'e2-medium (2 vCPU, 8GB)',
'Cost per Hour': f"${gcp_jump_host:.3f}",
'Status': '✏️ Custom' if gcp_jump_host != 0.067 else '✅ Default'
}, {
'Cost Component': 'GKE Management',
'Specification': 'Managed Kubernetes',
'Cost per Hour': f"${gcp_k8s_management:.3f}",
'Status': '✏️ Custom' if gcp_k8s_management != 0.10 else '✅ Default'
}, {
'Cost Component': 'H100 80GB GPU',
'Specification': 'Flagship GPU',
'Cost per Hour': f"${gcp_h100:.2f}",
'Status': '✏️ Custom' if gcp_h100 != 11.06 else '✅ Default'
}, {
'Cost Component': 'A100 80GB GPU',
'Specification': 'Excellent GPU',
'Cost per Hour': f"${gcp_a100_80:.2f}",
'Status': '✏️ Custom' if gcp_a100_80 != 2.48 else '✅ Default'
}, {
'Cost Component': 'A100 40GB GPU',
'Specification': 'Good GPU',
'Cost per Hour': f"${gcp_a100_40:.2f}",
'Status': '✏️ Custom' if gcp_a100_40 != 1.46 else '✅ Default'
}]
gcp_config_df = pd.DataFrame(gcp_config_data)
st.dataframe(gcp_config_df, use_container_width=True)
# On-Premise tab: the hard-coded default literals mirror the values in
# CLOUD_PRICING['On-Premise'] at the top of this file (0.192 / 0.384 /
# 0.048 / 0.05 / 0.55 multiplier / etc.).
with price_tab4:
onprem_config_data = [{
'Cost Component': 'Standard Compute Node',
'Specification': '8 vCPU, 32GB RAM',
'Cost per Hour': f"${onprem_standard_node:.3f}",
'Status': '✏️ Custom' if onprem_standard_node != 0.192 else '✅ Default'
}, {
'Cost Component': 'VectorDB Node',
'Specification': '16 vCPU, 64GB RAM',
'Cost per Hour': f"${onprem_vectordb_node:.3f}",
'Status': '✏️ Custom' if onprem_vectordb_node != 0.384 else '✅ Default'
}, {
'Cost Component': 'Jump Host',
'Specification': '2 vCPU, 8GB RAM',
'Cost per Hour': f"${onprem_jump_host:.3f}",
'Status': '✏️ Custom' if onprem_jump_host != 0.048 else '✅ Default'
}, {
# GPU rate is derived from AWS pricing via the on-prem multiplier.
# NOTE(review): the pricing key here is lowercase 'on-premise' while the
# CLOUD_PRICING provider key is 'On-Premise' — confirm the GPUS dict
# really uses the lowercase key.
'Cost Component': 'GPU Pricing',
'Specification': f'{onprem_gpu_multiplier*100:.0f}% of AWS pricing',
'Cost per Hour': f"${GPUS[selected_gpu]['pricing']['on-premise']:.2f} (for {selected_gpu})",
'Status': '✏️ Custom' if onprem_gpu_multiplier != 0.55 else '✅ Default'
}, {
'Cost Component': 'K8s Management',
'Specification': 'Self-managed operational cost',
'Cost per Hour': f"${onprem_k8s_management:.3f}",
'Status': '✏️ Custom' if onprem_k8s_management != 0.05 else '✅ Default'
}, {
'Cost Component': 'Network Infrastructure',
'Specification': 'Switches, routers, firewalls',
'Cost per Hour': f"${onprem_network:.3f}",
'Status': '✏️ Custom' if onprem_network != 0.020 else '✅ Default'
}, {
# Storage is priced per GB-month, unlike the other hourly rows.
'Cost Component': 'Storage SAN/NAS',
'Specification': 'Per GB per month',
'Cost per Hour': f"${onprem_storage_per_gb:.3f}/GB/month",
'Status': '✏️ Custom' if onprem_storage_per_gb != 0.05 else '✅ Default'
}, {
'Cost Component': 'Hardware Load Balancer',
'Specification': 'F5/Citrix ADC amortized',
'Cost per Hour': f"${onprem_load_balancer:.3f}",
'Status': '✏️ Custom' if onprem_load_balancer != 0.010 else '✅ Default'
}, {
'Cost Component': 'Power & Cooling',
'Specification': 'Datacenter utilities',
'Cost per Hour': f"${onprem_power_cooling:.3f}",
'Status': '✏️ Custom' if onprem_power_cooling != 0.030 else '✅ Default'
}, {
'Cost Component': 'Datacenter Space',
'Specification': 'Rack space and facilities',
'Cost per Hour': f"${onprem_datacenter_space:.3f}",
'Status': '✏️ Custom' if onprem_datacenter_space != 0.015 else '✅ Default'
}, {
'Cost Component': 'Maintenance & Support',
'Specification': 'Vendor support contracts',
'Cost per Hour': f"${onprem_maintenance:.3f}",
'Status': '✏️ Custom' if onprem_maintenance != 0.025 else '✅ Default'
}]
onprem_config_df = pd.DataFrame(onprem_config_data)
st.dataframe(onprem_config_df, use_container_width=True)
# Static usage tips for the pricing configuration (rendered as markdown).
st.markdown("""
**💡 Configuration Tips:**
- Adjust pricing in the sidebar under "Cloud Provider Pricing (Optional)"
- Default values based on public pricing as of 2024/2025
- Customize based on your actual contract rates, discounts, or negotiated pricing
- All calculations update automatically when values are changed
- Click "🔄 Reset All Pricing to Defaults" in sidebar to restore original values
""")
# VM Types Summary
st.markdown("### 🖥️ Deployment Options Summary")
# One row per provider, pulling instance-type labels straight out of the
# CLOUD_PRICING dict defined at the top of the file.
deployment_options_data = []
for provider in CLOUD_PRICING.keys():
pricing = CLOUD_PRICING[provider]
deployment_options_data.append({
'Provider': provider,
'Standard Node': pricing['description'],
'VectorDB Node': pricing['vectordb_node']['instance_type'],
'Jump Host': pricing['jump_host']['instance_type'],
# NOTE(review): this column is labeled "Managed K8s" but displays the
# provider's 'name' field — confirm that's the intended value.
'Managed K8s': pricing['name']
})
deployment_df = pd.DataFrame(deployment_options_data)
st.dataframe(deployment_df, use_container_width=True)
# Recommendations section
st.header("💡 Recommendations & Insights")
col1, col2 = st.columns(2)
# Left column: performance guidance driven by the computed GPU sizing.
with col1:
st.subheader("🎯 Performance Recommendations")
# Bottleneck flag comes from gpu_requirements (computed earlier).
if gpu_requirements['bottleneck'] == 'Memory':
st.info("💾 **Memory-bound workload**: Consider using INT8 or INT4 quantization to reduce memory requirements")
else:
st.info("⚡ **Throughput-bound workload**: Current memory is sufficient, focus on GPU count for throughput")
# Headroom = spare capacity above the target load, as a % of max capacity.
# NOTE(review): divides by max_conversations_per_minute — would raise
# ZeroDivisionError if that can ever be 0; confirm upstream guarantees > 0.
capacity_headroom = gpu_requirements['max_conversations_per_minute'] - conversations_per_minute
headroom_percentage = (capacity_headroom / gpu_requirements['max_conversations_per_minute']) * 100
# Thresholds: <20% = near capacity, >70% = ample room, else balanced.
if headroom_percentage < 20:
st.warning(f"🚨 **Low headroom** ({headroom_percentage:.1f}%): System near capacity. Consider adding more GPUs or optimizing workload distribution")
elif headroom_percentage > 70:
st.success(f"✅ **High headroom** ({headroom_percentage:.1f}%): System has significant capacity for growth")
else:
st.info(f"📊 **Balanced headroom** ({headroom_percentage:.1f}%): Good balance between capacity and resource efficiency")
# Application deployment recommendations
# App-density guidance: >12 apps/tenant is flagged, <=4 fits one node.
if apps_per_tenant > 12:
st.warning(f"📦 **High app density**: {apps_per_tenant} apps per tenant requires {infrastructure['totals']['deploy_nodes_per_tenant']} deployment nodes. Consider application consolidation")
elif apps_per_tenant <= 4:
st.success(f"✅ **Efficient deployment**: Only 1 deployment node needed for {apps_per_tenant} apps per tenant")
else:
st.info(f"📊 **Moderate app density**: {infrastructure['totals']['deploy_nodes_per_tenant']} deployment nodes for {apps_per_tenant} apps per tenant")
# Right column: cost-optimization guidance. Branches on how many
# providers actually offer the selected GPU (2+, exactly 1, or none).
with col2:
st.subheader("💰 Cost Optimization")
available_providers = get_available_providers_for_gpu(gpu_spec)
if len(available_providers) >= 2:
# Rank providers by total cost from the all_costs dict built earlier.
cheapest_provider = min(available_providers,
key=lambda x: all_costs[x]['totals']['total_cost'])
most_expensive_provider = max(available_providers,
key=lambda x: all_costs[x]['totals']['total_cost'])
savings = all_costs[most_expensive_provider]['totals']['total_cost'] - all_costs[cheapest_provider]['totals']['total_cost']
savings_percentage = (savings / all_costs[most_expensive_provider]['totals']['total_cost']) * 100
# On-premise wins get an extra capex/operational caveat.
if cheapest_provider == 'On-Premise':
st.success(f"💡 **Recommended Option**: On-Premise Deployment")
st.info(f"💰 **Cost Advantage**: ${savings:.2f} ({savings_percentage:.1f}%) savings compared to {most_expensive_provider}")
st.warning("⚠️ **Consider**: Initial capex, datacenter readiness, and operational expertise for on-premise")
else:
st.success(f"💡 **Recommended Provider**: {cheapest_provider}")
st.info(f"💰 **Potential Savings**: ${savings:.2f} ({savings_percentage:.1f}%) compared to {most_expensive_provider}")
# Cost distribution insight
# Split the winner's total into platform vs GPU share.
cheapest_costs = all_costs[cheapest_provider]
platform_percentage = (cheapest_costs['totals']['platform_cost'] / cheapest_costs['totals']['total_cost']) * 100
gpu_percentage = (cheapest_costs['totals']['gpu_cost'] / cheapest_costs['totals']['total_cost']) * 100
if gpu_percentage > 70:
st.warning("🖥️ **GPU-heavy costs**: Consider optimizing model size or using more efficient GPUs")
else:
st.info(f"⚖️ **Balanced infrastructure**: Platform ({platform_percentage:.0f}%) vs GPU ({gpu_percentage:.0f}%)")
elif len(available_providers) == 1:
# Single-provider case: no comparison possible, just show its breakdown.
available_provider = available_providers[0]
st.success(f"💡 **Available Option**: {available_provider}")
if available_provider == 'On-Premise':
st.info(f"🏢 On-premise is your only deployment option for this GPU")
else:
st.warning("⚠️ **Limited Options**: Only one provider has the selected GPU available")
# Show cost distribution for the only available provider
provider_costs = all_costs[available_provider]
platform_percentage = (provider_costs['totals']['platform_cost'] / provider_costs['totals']['total_cost']) * 100
gpu_percentage = (provider_costs['totals']['gpu_cost'] / provider_costs['totals']['total_cost']) * 100
st.info(f"📊 **Cost Distribution**: Platform ({platform_percentage:.0f}%) vs GPU ({gpu_percentage:.0f}%)")
else:
# No provider offers this GPU: surface up to three alternatives that are
# available somewhere, drawn from the GPUS catalog.
st.error("❌ **No Available Options**: Selected GPU is not available on any deployment option")
st.warning("**Action Required**: Please select a different GPU model")
# Show alternative GPUs
st.markdown("**💡 Suggested Alternatives:**")
alternatives = []
for gpu_name, gpu_data in GPUS.items():
available_on = get_available_providers_for_gpu(gpu_data)
if available_on:
alternatives.append(f"• **{gpu_name}** - Available on: {', '.join(available_on)}")
if alternatives:
for alt in alternatives[:3]: # Show top 3 alternatives
st.markdown(alt)
# Infrastructure Summary Box
# Three-column recap of the whole configuration: platform nodes, GPU
# sizing, and performance metrics.
st.header("📋 Infrastructure Summary")
summary_col1, summary_col2, summary_col3 = st.columns(3)
with summary_col1:
st.markdown("### Platform Infrastructure")
st.markdown(f"""
- **Tenants**: {num_tenants}
- **Apps per Tenant**: {apps_per_tenant}
- **Total Applications**: {infrastructure['totals']['total_apps']}
- **Standard Nodes**: {infrastructure['totals']['total_standard_nodes']} (8 vCPU, 32GB)
- **VectorDB Nodes**: {infrastructure['totals']['total_vectordb_nodes']} (16 vCPU, 64GB)
- **Total Platform Nodes**: {infrastructure['totals']['total_nodes']}
""")
with summary_col2:
st.markdown("### GPU Infrastructure")
# NOTE(review): unlike the platform-specs table above, this indexes
# best_config unconditionally — would raise if best_config is falsy;
# confirm it is always set when this section renders.
st.markdown(f"""
- **Model**: {selected_model}
- **GPU Type**: {selected_gpu}
- **Precision**: {precision}
- **GPUs Required**: {gpu_requirements['total_gpus_needed']}
- **GPUs Allocated**: {gpu_requirements['actual_gpus_allocated']}
- **GPU Configuration**: {gpu_requirements['best_config']['num_nodes']} nodes × {gpu_requirements['best_config']['gpus_per_node']} GPUs
""")
with summary_col3:
st.markdown("### Performance Metrics")
# Headroom recomputed here (same formula as the recommendations column).
capacity_headroom = gpu_requirements['max_conversations_per_minute'] - conversations_per_minute
headroom_percentage = (capacity_headroom / gpu_requirements['max_conversations_per_minute']) * 100
st.markdown(f"""
- **Target Load**: {conversations_per_minute} conv/min
- **Max Capacity**: {gpu_requirements['max_conversations_per_minute']:.0f} conv/min
- **Capacity Headroom**: {headroom_percentage:.1f}%
- **Bottleneck**: {gpu_requirements['bottleneck']}
- **Total TPS**: {gpu_requirements['total_system_tps']:.0f}
- **Tokens/Conv**: {tokens_per_conversation}
""")
# Entry point: `streamlit run` executes this file as __main__, so the
# guard fires and builds the full dashboard (defined earlier in the file).
if __name__ == "__main__":
create_comprehensive_dashboard()