Spaces:

jkottu
/

llm-inference-dashboard

Sleeping

App Files Files Community

llm-inference-dashboard / components /gpu_panel.py

jkottu

Initial commit: LLM Inference Dashboard

aefabf0 about 2 months ago

raw

history blame contribute delete

5.92 kB

	"""GPU status panel component."""

	import gradio as gr
	import pandas as pd
	from typing import List, Dict, Any, Tuple

	from collectors.gpu_collector import GPUCollector, GPUStats
	from utils.history import MetricHistory


	def create_gpu_panel(history: MetricHistory) -> Dict[str, Any]:
	    """
	    Lay out the "GPU / Rank Status" section of the dashboard.

	    The section consists of a heading, a read-only statistics table, two
	    line plots (memory and utilization, both colored per GPU) and an HTML
	    banner for NCCL status that starts out in the "Healthy" state.

	    Args:
	        history: Metric history for charting. It is not read while the
	            components are being created.

	    Returns:
	        Dictionary mapping "gpu_table", "gpu_memory_plot", "gpu_util_plot"
	        and "nccl_status" to the created Gradio components.
	    """
	    # Both time-series plots share everything except title and y-axis label.
	    shared_plot_args = {
	        "x": "time",
	        "y": "value",
	        "color": "gpu",
	        "x_title": "Time",
	        "height": 250,
	    }

	    with gr.Column():
	        gr.Markdown("### GPU / Rank Status")

	        stats_table = gr.Dataframe(
	            headers=["GPU", "Name", "Memory", "Memory %", "Util %", "Temp", "Power", "TP Rank"],
	            datatype=["number", "str", "str", "number", "number", "str", "str", "str"],
	            label="GPU Statistics",
	            interactive=False,
	        )

	        with gr.Row():
	            memory_plot = gr.LinePlot(
	                title="GPU Memory Usage (GB)",
	                y_title="Memory (GB)",
	                **shared_plot_args,
	            )
	            utilization_plot = gr.LinePlot(
	                title="GPU Utilization (%)",
	                y_title="Utilization %",
	                **shared_plot_args,
	            )

	        # Initial banner shown before the first update replaces it.
	        status_banner = gr.HTML(
	            value='<div style="padding: 10px; background: #e8f5e9; border-radius: 5px;">'
	            '<span style="color: #2e7d32;">NCCL Status: Healthy</span></div>',
	            label="Communication Status",
	        )

	    components: Dict[str, Any] = {}
	    components["gpu_table"] = stats_table
	    components["gpu_memory_plot"] = memory_plot
	    components["gpu_util_plot"] = utilization_plot
	    components["nccl_status"] = status_banner
	    return components


	def update_gpu_panel(
	    collector: GPUCollector,
	    history: MetricHistory,
	) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
	    """
	    Collect GPU stats once and produce new values for every panel output.

	    Each collected sample is appended to ``history`` under the
	    "gpu_memory_gb" and "gpu_util_percent" metrics, labelled with the GPU
	    id, before the chart frames are built from that same history.

	    Args:
	        collector: GPU collector instance
	        history: Metric history

	    Returns:
	        Tuple of (table_data, memory_plot_data, util_plot_data, nccl_html)
	    """
	    gpu_stats = collector.collect()

	    # Record memory first, then utilization, for every GPU.
	    for gpu in gpu_stats:
	        readings = (
	            ("gpu_memory_gb", gpu.memory_used_gb),
	            ("gpu_util_percent", gpu.gpu_util_percent),
	        )
	        for metric_name, reading in readings:
	            history.add(metric_name, reading, labels={"gpu": str(gpu.gpu_id)})

	    # NCCL status is a simplified heuristic derived from the GPU stats.
	    return (
	        _build_table(gpu_stats),
	        _build_memory_chart_data(history),
	        _build_util_chart_data(history),
	        _build_nccl_status(gpu_stats),
	    )


	def _build_table(stats: List[GPUStats]) -> pd.DataFrame:
	"""Build GPU stats table."""
	rows = []
	for stat in stats:
	rows.append({
	"GPU": stat.gpu_id,
	"Name": stat.name[:20] if len(stat.name) > 20 else stat.name,
	"Memory": f"{stat.memory_used_gb:.1f}/{stat.memory_total_gb:.1f} GB",
	"Memory %": round(stat.memory_percent, 1),
	"Util %": round(stat.gpu_util_percent, 1),
	"Temp": f"{stat.temperature_c}C",
	"Power": f"{stat.power_watts:.0f}/{stat.power_limit_watts:.0f}W",
	"TP Rank": str(stat.tp_rank) if stat.tp_rank is not None else "-",
	})

	return pd.DataFrame(rows)


	def _build_memory_chart_data(history: MetricHistory) -> pd.DataFrame:
	"""Build memory usage chart data."""
	all_series = history.get_all_series("gpu_memory_gb")

	rows = []
	for key, points in all_series.items():
	gpu_id = key.split("=")[-1] if "=" in key else "0"
	for point in points[-60:]: # Last 60 points
	rows.append({
	"time": point.timestamp,
	"value": point.value,
	"gpu": f"GPU {gpu_id}",
	})

	if not rows:
	return pd.DataFrame({"time": [], "value": [], "gpu": []})

	return pd.DataFrame(rows)


	def _build_util_chart_data(history: MetricHistory) -> pd.DataFrame:
	"""Build utilization chart data."""
	all_series = history.get_all_series("gpu_util_percent")

	rows = []
	for key, points in all_series.items():
	gpu_id = key.split("=")[-1] if "=" in key else "0"
	for point in points[-60:]:
	rows.append({
	"time": point.timestamp,
	"value": point.value,
	"gpu": f"GPU {gpu_id}",
	})

	if not rows:
	return pd.DataFrame({"time": [], "value": [], "gpu": []})

	return pd.DataFrame(rows)


	def _build_nccl_status(stats: List[GPUStats]) -> str:
	"""Build NCCL status HTML."""
	if not stats:
	return (
	'<div style="padding: 10px; background: #fff3e0; border-radius: 5px;">'
	'<span style="color: #e65100;">NCCL Status: No GPUs detected</span></div>'
	)

	# Check for GPU communication health indicators
	# In a real implementation, this would check vLLM metrics for NCCL errors
	all_healthy = all(stat.gpu_util_percent > 0 or stat.memory_percent > 0 for stat in stats)

	if all_healthy:
	return (
	'<div style="padding: 10px; background: #e8f5e9; border-radius: 5px;">'
	f'<span style="color: #2e7d32;">NCCL Status: Healthy ({len(stats)} GPUs)</span></div>'
	)
	else:
	return (
	'<div style="padding: 10px; background: #ffebee; border-radius: 5px;">'
	'<span style="color: #c62828;">NCCL Status: Communication issue detected</span></div>'
	)