Spaces:
Sleeping
Sleeping
| """GPU status panel component.""" | |
| import gradio as gr | |
| import pandas as pd | |
| from typing import List, Dict, Any, Tuple | |
| from collectors.gpu_collector import GPUCollector, GPUStats | |
| from utils.history import MetricHistory | |
def create_gpu_panel(history: MetricHistory) -> Dict[str, Any]:
    """
    Assemble the Gradio widgets that make up the GPU status panel.

    The panel is a single column holding a heading, a read-only statistics
    table, a row with two line plots (memory and utilization over time) and
    an HTML banner for the NCCL / communication status.

    Args:
        history: Metric history for charting. It is not read while the
            widgets are being constructed.

    Returns:
        Dictionary mapping "gpu_table", "gpu_memory_plot", "gpu_util_plot"
        and "nccl_status" to the corresponding Gradio components.
    """
    # (header, Gradio datatype) pairs, in display order.
    table_columns = [
        ("GPU", "number"),
        ("Name", "str"),
        ("Memory", "str"),
        ("Memory %", "number"),
        ("Util %", "number"),
        ("Temp", "str"),
        ("Power", "str"),
        ("TP Rank", "str"),
    ]
    # Settings shared by both time-series plots.
    shared_plot_options = dict(
        x="time",
        y="value",
        color="gpu",
        x_title="Time",
        height=250,
    )

    with gr.Column():
        gr.Markdown("### GPU / Rank Status")

        stats_table = gr.Dataframe(
            headers=[header for header, _ in table_columns],
            datatype=[kind for _, kind in table_columns],
            label="GPU Statistics",
            interactive=False,
        )

        # Memory and utilization plots sit side by side.
        with gr.Row():
            memory_plot = gr.LinePlot(
                title="GPU Memory Usage (GB)",
                y_title="Memory (GB)",
                **shared_plot_options,
            )
            utilization_plot = gr.LinePlot(
                title="GPU Utilization (%)",
                y_title="Utilization %",
                **shared_plot_options,
            )

        # NOTE(review): original indentation was lost; the banner is assumed
        # to live in the column below the plot row - confirm.
        status_banner = gr.HTML(
            value='<div style="padding: 10px; background: #e8f5e9; border-radius: 5px;">'
            '<span style="color: #2e7d32;">NCCL Status: Healthy</span></div>',
            label="Communication Status",
        )

    return dict(
        gpu_table=stats_table,
        gpu_memory_plot=memory_plot,
        gpu_util_plot=utilization_plot,
        nccl_status=status_banner,
    )
def update_gpu_panel(
    collector: GPUCollector,
    history: MetricHistory,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """
    Collect fresh GPU stats, record them, and produce the panel's new values.

    Args:
        collector: GPU collector instance; its ``collect()`` result is
            iterated once to record history and then passed to the builders.
        history: Metric history that receives one "gpu_memory_gb" and one
            "gpu_util_percent" sample per GPU, labelled with the GPU id.

    Returns:
        Tuple of (table_data, memory_plot_data, util_plot_data, nccl_html)
    """
    # Metric name -> attribute on each collected stat, recorded in this order.
    recorded_metrics = (
        ("gpu_memory_gb", "memory_used_gb"),
        ("gpu_util_percent", "gpu_util_percent"),
    )

    gpu_stats = collector.collect()

    for gpu in gpu_stats:
        for metric_name, attribute in recorded_metrics:
            # A fresh labels dict per sample, exactly as many as samples added.
            history.add(
                metric_name,
                getattr(gpu, attribute),
                labels={"gpu": str(gpu.gpu_id)},
            )

    # The chart builders read from history, so they run after the samples
    # above have been added. NCCL status is a simplified heuristic.
    return (
        _build_table(gpu_stats),
        _build_memory_chart_data(history),
        _build_util_chart_data(history),
        _build_nccl_status(gpu_stats),
    )
| def _build_table(stats: List[GPUStats]) -> pd.DataFrame: | |
| """Build GPU stats table.""" | |
| rows = [] | |
| for stat in stats: | |
| rows.append({ | |
| "GPU": stat.gpu_id, | |
| "Name": stat.name[:20] if len(stat.name) > 20 else stat.name, | |
| "Memory": f"{stat.memory_used_gb:.1f}/{stat.memory_total_gb:.1f} GB", | |
| "Memory %": round(stat.memory_percent, 1), | |
| "Util %": round(stat.gpu_util_percent, 1), | |
| "Temp": f"{stat.temperature_c}C", | |
| "Power": f"{stat.power_watts:.0f}/{stat.power_limit_watts:.0f}W", | |
| "TP Rank": str(stat.tp_rank) if stat.tp_rank is not None else "-", | |
| }) | |
| return pd.DataFrame(rows) | |
| def _build_memory_chart_data(history: MetricHistory) -> pd.DataFrame: | |
| """Build memory usage chart data.""" | |
| all_series = history.get_all_series("gpu_memory_gb") | |
| rows = [] | |
| for key, points in all_series.items(): | |
| gpu_id = key.split("=")[-1] if "=" in key else "0" | |
| for point in points[-60:]: # Last 60 points | |
| rows.append({ | |
| "time": point.timestamp, | |
| "value": point.value, | |
| "gpu": f"GPU {gpu_id}", | |
| }) | |
| if not rows: | |
| return pd.DataFrame({"time": [], "value": [], "gpu": []}) | |
| return pd.DataFrame(rows) | |
| def _build_util_chart_data(history: MetricHistory) -> pd.DataFrame: | |
| """Build utilization chart data.""" | |
| all_series = history.get_all_series("gpu_util_percent") | |
| rows = [] | |
| for key, points in all_series.items(): | |
| gpu_id = key.split("=")[-1] if "=" in key else "0" | |
| for point in points[-60:]: | |
| rows.append({ | |
| "time": point.timestamp, | |
| "value": point.value, | |
| "gpu": f"GPU {gpu_id}", | |
| }) | |
| if not rows: | |
| return pd.DataFrame({"time": [], "value": [], "gpu": []}) | |
| return pd.DataFrame(rows) | |
| def _build_nccl_status(stats: List[GPUStats]) -> str: | |
| """Build NCCL status HTML.""" | |
| if not stats: | |
| return ( | |
| '<div style="padding: 10px; background: #fff3e0; border-radius: 5px;">' | |
| '<span style="color: #e65100;">NCCL Status: No GPUs detected</span></div>' | |
| ) | |
| # Check for GPU communication health indicators | |
| # In a real implementation, this would check vLLM metrics for NCCL errors | |
| all_healthy = all(stat.gpu_util_percent > 0 or stat.memory_percent > 0 for stat in stats) | |
| if all_healthy: | |
| return ( | |
| '<div style="padding: 10px; background: #e8f5e9; border-radius: 5px;">' | |
| f'<span style="color: #2e7d32;">NCCL Status: Healthy ({len(stats)} GPUs)</span></div>' | |
| ) | |
| else: | |
| return ( | |
| '<div style="padding: 10px; background: #ffebee; border-radius: 5px;">' | |
| '<span style="color: #c62828;">NCCL Status: Communication issue detected</span></div>' | |
| ) | |