jkottu's picture
Initial commit: LLM Inference Dashboard
aefabf0
"""GPU status panel component."""
import gradio as gr
import pandas as pd
from typing import List, Dict, Any, Tuple
from collectors.gpu_collector import GPUCollector, GPUStats
from utils.history import MetricHistory
def create_gpu_panel(history: MetricHistory) -> Dict[str, Any]:
    """
    Lay out the Gradio widgets of the GPU / rank status panel.

    The panel is one column holding, in order: a section heading, a
    read-only statistics table, a row containing the memory and utilization
    line plots, and an HTML banner for the NCCL status.

    Args:
        history: Metric history for charting. It is not read while the
            layout is built.

    Returns:
        Dictionary with the keys "gpu_table", "gpu_memory_plot",
        "gpu_util_plot" and "nccl_status", each mapped to the Gradio
        component created for it.
    """
    table_headers = [
        "GPU", "Name", "Memory", "Memory %", "Util %", "Temp", "Power", "TP Rank",
    ]
    table_datatypes = [
        "number", "str", "str", "number", "number", "str", "str", "str",
    ]

    # Options common to both plots: each one draws "value" over "time" with
    # one colored line per "gpu" label.
    shared_plot_options = {
        "x": "time",
        "y": "value",
        "color": "gpu",
        "x_title": "Time",
        "height": 250,
    }

    # Banner displayed until the first update replaces it.
    initial_banner = (
        '<div style="padding: 10px; background: #e8f5e9; border-radius: 5px;">'
        '<span style="color: #2e7d32;">NCCL Status: Healthy</span></div>'
    )

    components: Dict[str, Any] = {}
    with gr.Column():
        gr.Markdown("### GPU / Rank Status")

        # GPU stats table
        components["gpu_table"] = gr.Dataframe(
            headers=table_headers,
            datatype=table_datatypes,
            label="GPU Statistics",
            interactive=False,
        )

        with gr.Row():
            # Memory usage plot
            components["gpu_memory_plot"] = gr.LinePlot(
                title="GPU Memory Usage (GB)",
                y_title="Memory (GB)",
                **shared_plot_options,
            )
            # Utilization plot
            components["gpu_util_plot"] = gr.LinePlot(
                title="GPU Utilization (%)",
                y_title="Utilization %",
                **shared_plot_options,
            )

        # NCCL / Communication status
        components["nccl_status"] = gr.HTML(
            value=initial_banner,
            label="Communication Status",
        )

    return components
def update_gpu_panel(
    collector: GPUCollector,
    history: MetricHistory,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """
    Collect fresh GPU stats, record them, and build the panel's new values.

    Args:
        collector: GPU collector instance; its ``collect()`` result supplies
            the per-GPU stats.
        history: Metric history. Each GPU's used memory and utilization are
            appended to it before the chart data is read back.

    Returns:
        Tuple of (table_data, memory_plot_data, util_plot_data, nccl_html)
    """
    gpu_stats = collector.collect()

    # (history metric name, stat attribute holding its value)
    tracked_metrics = (
        ("gpu_memory_gb", "memory_used_gb"),
        ("gpu_util_percent", "gpu_util_percent"),
    )

    # Record the new samples first so the charts built below include them.
    for gpu in gpu_stats:
        for metric_name, attribute in tracked_metrics:
            history.add(
                metric_name,
                getattr(gpu, attribute),
                # A fresh dict per call, so no label mapping is ever shared.
                labels={"gpu": str(gpu.gpu_id)},
            )

    # NCCL status is simplified - would need more complex detection.
    return (
        _build_table(gpu_stats),
        _build_memory_chart_data(history),
        _build_util_chart_data(history),
        _build_nccl_status(gpu_stats),
    )
def _build_table(stats: List[GPUStats]) -> pd.DataFrame:
    """
    Build the GPU statistics table.

    Args:
        stats: Per-GPU samples; each entry becomes one table row.

    Returns:
        DataFrame with the columns "GPU", "Name", "Memory", "Memory %",
        "Util %", "Temp", "Power" and "TP Rank". The columns are present
        even when ``stats`` is empty, matching the empty-frame handling of
        the chart data builders.
    """
    columns = [
        "GPU", "Name", "Memory", "Memory %", "Util %", "Temp", "Power", "TP Rank",
    ]

    rows = [
        {
            "GPU": stat.gpu_id,
            # Slicing already leaves short names untouched, so no length
            # check is needed to cap the name at 20 characters.
            "Name": stat.name[:20],
            "Memory": f"{stat.memory_used_gb:.1f}/{stat.memory_total_gb:.1f} GB",
            "Memory %": round(stat.memory_percent, 1),
            "Util %": round(stat.gpu_util_percent, 1),
            "Temp": f"{stat.temperature_c}C",
            "Power": f"{stat.power_watts:.0f}/{stat.power_limit_watts:.0f}W",
            # Compare against None explicitly: rank 0 is a valid rank.
            "TP Rank": str(stat.tp_rank) if stat.tp_rank is not None else "-",
        }
        for stat in stats
    ]

    # Without explicit columns an empty row list yields a frame that has no
    # columns at all.
    return pd.DataFrame(rows, columns=columns)
def _build_memory_chart_data(history: MetricHistory) -> pd.DataFrame:
    """
    Build the long-format frame behind the memory usage plot.

    Args:
        history: Metric history; every series stored under "gpu_memory_gb"
            is read from it.

    Returns:
        DataFrame with the columns "time", "value" and "gpu", holding at
        most the last 60 points of each series. When there are no points
        the frame is empty but still has those three columns.
    """
    records = []
    for series_key, samples in history.get_all_series("gpu_memory_gb").items():
        # The GPU id is whatever follows the last "=" in the series key;
        # keys without "=" fall back to GPU "0".
        _, separator, suffix = series_key.rpartition("=")
        gpu_id = suffix if separator else "0"
        legend = f"GPU {gpu_id}"
        records.extend(
            {"time": sample.timestamp, "value": sample.value, "gpu": legend}
            for sample in samples[-60:]  # Last 60 points
        )

    if records:
        return pd.DataFrame(records)
    return pd.DataFrame({"time": [], "value": [], "gpu": []})
def _build_util_chart_data(history: MetricHistory) -> pd.DataFrame:
    """
    Build the long-format frame behind the utilization plot.

    Args:
        history: Metric history; every series stored under
            "gpu_util_percent" is read from it.

    Returns:
        DataFrame with the columns "time", "value" and "gpu", holding at
        most the last 60 points of each series. When there are no points
        the frame is empty but still has those three columns.
    """
    utilization_series = history.get_all_series("gpu_util_percent")

    records = []
    for series_key, samples in utilization_series.items():
        # Series keys carry the GPU id after their last "="; a key without
        # one is attributed to GPU "0".
        if "=" in series_key:
            gpu_id = series_key.split("=")[-1]
        else:
            gpu_id = "0"

        recent_samples = samples[-60:]  # Last 60 points
        records += [
            {
                "time": sample.timestamp,
                "value": sample.value,
                "gpu": f"GPU {gpu_id}",
            }
            for sample in recent_samples
        ]

    if len(records) == 0:
        return pd.DataFrame({"time": [], "value": [], "gpu": []})
    return pd.DataFrame(records)
def _build_nccl_status(stats: List[GPUStats]) -> str:
    """
    Render the NCCL status banner as an HTML snippet.

    Args:
        stats: Per-GPU samples from the latest collection.

    Returns:
        An orange "No GPUs detected" banner when ``stats`` is empty, a red
        "Communication issue detected" banner when some GPU reports neither
        utilization nor memory use above zero, and otherwise a green
        "Healthy" banner that includes the GPU count.
    """
    if not stats:
        return (
            '<div style="padding: 10px; background: #fff3e0; border-radius: 5px;">'
            '<span style="color: #e65100;">NCCL Status: No GPUs detected</span></div>'
        )

    # Check for GPU communication health indicators.
    # In a real implementation, this would check vLLM metrics for NCCL errors;
    # here a GPU counts as healthy when it shows any utilization or memory use.
    for gpu in stats:
        shows_activity = gpu.gpu_util_percent > 0 or gpu.memory_percent > 0
        if not shows_activity:
            return (
                '<div style="padding: 10px; background: #ffebee; border-radius: 5px;">'
                '<span style="color: #c62828;">NCCL Status: Communication issue detected</span></div>'
            )

    gpu_count = len(stats)
    return (
        '<div style="padding: 10px; background: #e8f5e9; border-radius: 5px;">'
        f'<span style="color: #2e7d32;">NCCL Status: Healthy ({gpu_count} GPUs)</span></div>'
    )