alpha31476's picture
LDM-train-pass, checking results
87ef7b5 verified
# pip install dash plotly nvidia-ml-py dash-daq
import sys
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import nvidia_smi
from collections import defaultdict, deque
import dash_daq as daq # For LED displays
# Rolling-window length: each per-GPU series keeps at most this many samples.
max_history_length = 100


def _new_gpu_history():
    """Create the empty, bounded sample series tracked for a single GPU."""
    series_names = (
        "time_history",
        "memory_history",
        "utilization_history",
        "memory_percentage_history",  # memory usage as a percentage
    )
    # The bound is read at call time, exactly when a GPU's entry is first
    # created by the defaultdict below.
    return {name: deque(maxlen=max_history_length) for name in series_names}


# Per-GPU history, keyed by whatever key the caller indexes with; entries are
# created lazily on first access.
gpu_histories = defaultdict(_new_gpu_history)

# Plotly template name applied to every figure.
plot_template = "ggplot2"
# Function to get GPU memory usage, utilization, and name
def get_gpu_metrics(device_id):
    """Query NVML for the memory usage, utilization and name of one GPU.

    Args:
        device_id: Index passed to ``nvmlDeviceGetHandleByIndex``, or ``None``
            when no GPU is selected.

    Returns:
        A tuple ``(allocated_gb, allocated_mb, free_gb, total_gb,
        gpu_utilization, gpu_name)``. If ``device_id`` is ``None`` or an NVML
        query fails, the numeric fields are ``0`` and the name is
        ``"Unknown GPU"``; NVML errors are printed rather than raised.
    """
    # Defaults returned when the GPU cannot be queried. Every returned name is
    # initialised here so the final ``return`` can never hit an unbound local.
    allocated_memory = 0
    allocated_memory_MB = 0
    free_memory = 0
    total_memory = 0
    gpu_utilization = 0
    gpu_name = "Unknown GPU"

    # Nothing selected: there is no device to query.
    if device_id is None:
        return (allocated_memory, allocated_memory_MB, free_memory,
                total_memory, gpu_utilization, gpu_name)

    try:
        nvidia_smi.nvmlInit()
        try:
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)
            mem_info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
            util_info = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
            name = nvidia_smi.nvmlDeviceGetName(handle)
            # NOTE(review): some NVML binding versions appear to return the
            # name as bytes; decode so it formats cleanly in titles/labels.
            if isinstance(name, bytes):
                name = name.decode("utf-8", errors="replace")
            gpu_name = name

            used_bytes = mem_info.total - mem_info.free
            # In GB
            total_memory = mem_info.total / (1024 ** 3)
            free_memory = mem_info.free / (1024 ** 3)
            allocated_memory = used_bytes / (1024 ** 3)
            # In MB
            allocated_memory_MB = used_bytes / (1024 ** 2)
            gpu_utilization = util_info.gpu  # GPU utilization in percentage
        finally:
            # Pair every successful nvmlInit with a shutdown, even when one of
            # the queries above raises.
            nvidia_smi.nvmlShutdown()
    except nvidia_smi.NVMLError as error:
        print(f"Error fetching GPU metrics: {error}")

    return (allocated_memory, allocated_memory_MB, free_memory,
            total_memory, gpu_utilization, gpu_name)
# Get list of available GPUs
def get_gpu_list():
    """Enumerate the GPUs visible to NVML as dropdown options.

    Returns:
        A list of ``{"label": "GPU <i> (<name>)", "value": <i>}`` dicts, one
        per device index. If NVML raises, the error is printed and whatever
        was collected so far (possibly an empty list) is returned.
    """
    gpu_list = []
    try:
        nvidia_smi.nvmlInit()
        try:
            for index in range(nvidia_smi.nvmlDeviceGetCount()):
                handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
                gpu_name = nvidia_smi.nvmlDeviceGetName(handle)
                # NOTE(review): some NVML binding versions appear to return
                # bytes here; decode so the label does not render as b'...'.
                if isinstance(gpu_name, bytes):
                    gpu_name = gpu_name.decode("utf-8", errors="replace")
                gpu_list.append({"label": f"GPU {index} ({gpu_name})", "value": index})
        finally:
            # Pair every successful nvmlInit with a shutdown, even when the
            # enumeration above raises part-way through.
            nvidia_smi.nvmlShutdown()
    except nvidia_smi.NVMLError as error:
        print(f"Error fetching GPU list: {error}")
    return gpu_list
# Dash app setup
# Colour shared by every LED read-out.
_LED_COLOR = '#FF5E5E'

# (label text, component-id suffix) for the four read-outs of each metric,
# in display order.
_LED_STATS = (
    ("Current:", "current"),
    ("Min:", "min"),
    ("Max:", "max"),
    ("Avg:", "avg"),
)


def _led_cell(label_text, component_id, is_last):
    """Build one labelled LED read-out; all but the last get a right margin."""
    cell_style = {'display': 'inline-block'}
    if not is_last:
        cell_style['margin-right'] = '20px'
    return html.Div(
        [
            html.Label(label_text),
            daq.LEDDisplay(id=component_id, value="0.00", size=20, color=_LED_COLOR),
        ],
        style=cell_style,
    )


def _metric_section(heading, id_prefix, graph_id):
    """Build a heading, a row of Current/Min/Max/Avg LEDs, and a graph."""
    final_position = len(_LED_STATS) - 1
    led_cells = [
        _led_cell(label_text, f"{id_prefix}-{suffix}", position == final_position)
        for position, (label_text, suffix) in enumerate(_LED_STATS)
    ]
    led_row = html.Div(led_cells, style={'display': 'flex', 'margin-bottom': '20px'})
    return html.Div([html.H3(heading), led_row, dcc.Graph(id=graph_id)])


app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("Real-Time GPU Metrics"),
    html.H4("Ashish"),
    # GPU selector; the option list is fetched once, when the layout is built.
    html.Div([
        html.Label("Select GPU:"),
        dcc.Dropdown(
            id='gpu-dropdown',
            options=get_gpu_list(),
            value=0,  # default to the first GPU
        ),
    ]),
    _metric_section("GPU Memory Usage (GB)", "memory", "gpu-memory-graph"),
    _metric_section("GPU Memory Usage (%)", "memory-percentage", "gpu-memory-percentage-graph"),
    _metric_section("GPU Utilization (%)", "utilization", "gpu-utilization-graph"),
    # Ticks every 1000 ms, i.e. once per second.
    dcc.Interval(
        id='interval-component',
        interval=1000,
        n_intervals=0,
    ),
])
def _summarize_history(samples):
    """Return ``(min, max, mean)`` of *samples*, or all zeros when it is empty."""
    if not samples:
        return 0, 0, 0
    return min(samples), max(samples), sum(samples) / len(samples)


def _no_gpu_outputs():
    """Build the 15-value callback result used when no GPU data is available."""
    placeholder = go.Figure(
        layout=dict(
            title="No GPU Available",
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            annotations=[dict(
                text="No GPU detected!",
                xref="paper",
                yref="paper",
                showarrow=False,
                font=dict(size=20)
            )]
        )
    )
    # Three figures followed by one "0.00" per LED display
    # (3 metrics x 4 read-outs = 12), matching the 15 declared Outputs.
    return (placeholder,) * 3 + ("0.00",) * 12


def _time_series_figure(times, series, title, yaxis_title,
                        yaxis_range=None, legend_title=None):
    """Build a line figure with one trace per ``(name, values, color)`` entry."""
    figure = go.Figure()
    for trace_name, values, color in series:
        figure.add_trace(go.Scatter(
            x=times,
            y=values,
            name=trace_name,
            mode='lines',
            line=dict(color=color)
        ))
    yaxis = dict(type='linear')
    if yaxis_range is not None:
        yaxis['range'] = list(yaxis_range)
    layout_options = dict(title=title, xaxis_title="Time (s)", yaxis_title=yaxis_title)
    if legend_title is not None:
        layout_options['legend_title'] = legend_title
    layout_options.update(template=plot_template, xaxis=dict(type='linear'), yaxis=yaxis)
    figure.update_layout(**layout_options)
    return figure


@app.callback(
    [Output('gpu-memory-graph', 'figure'),
     Output('gpu-memory-percentage-graph', 'figure'),
     Output('gpu-utilization-graph', 'figure'),
     Output('memory-current', 'value'),
     Output('memory-min', 'value'),
     Output('memory-max', 'value'),
     Output('memory-avg', 'value'),
     Output('memory-percentage-current', 'value'),
     Output('memory-percentage-min', 'value'),
     Output('memory-percentage-max', 'value'),
     Output('memory-percentage-avg', 'value'),
     Output('utilization-current', 'value'),
     Output('utilization-min', 'value'),
     Output('utilization-max', 'value'),
     Output('utilization-avg', 'value')],
    [Input('interval-component', 'n_intervals'),
     Input('gpu-dropdown', 'value')]
)
def update_graphs(n, selected_gpu):
    """Sample the selected GPU and refresh the three graphs and twelve LEDs.

    Args:
        n: Tick count of the interval component; used as the x value of the
            new sample (a proxy for elapsed time).
        selected_gpu: Current value of the GPU dropdown, or ``None`` when the
            selection is empty.

    Returns:
        A 15-tuple in the order of the declared Outputs: the memory,
        memory-percentage and utilization figures, then current/min/max/avg
        strings (two decimals) for memory in GB, memory in percent, and
        utilization. When no GPU data is available, three placeholder figures
        and twelve ``"0.00"`` strings are returned instead.
    """
    # An empty dropdown selection means there is nothing to query.
    if selected_gpu is None:
        return _no_gpu_outputs()

    # The MB figure is not used: every memory read-out in this section is in
    # GB so that Current agrees with Min/Max/Avg and the section heading.
    (allocated_memory, _allocated_memory_mb, free_memory,
     total_memory, gpu_utilization, gpu_name) = get_gpu_metrics(selected_gpu)

    # A zero total means the query produced no data. Testing the total rather
    # than the allocated amount keeps a GPU with nothing allocated visible.
    if not total_memory:
        return _no_gpu_outputs()

    memory_percentage = (allocated_memory / total_memory) * 100

    # Record the new sample in the selected GPU's rolling history.
    gpu_history = gpu_histories[selected_gpu]
    gpu_history["time_history"].append(n)  # interval count as a proxy for time
    gpu_history["memory_history"].append((allocated_memory, free_memory, total_memory))
    gpu_history["utilization_history"].append(gpu_utilization)
    gpu_history["memory_percentage_history"].append(memory_percentage)

    times = list(gpu_history["time_history"])
    memory_samples = list(gpu_history["memory_history"])
    allocated_memory_history = [sample[0] for sample in memory_samples]
    percentage_history = list(gpu_history["memory_percentage_history"])
    utilization_history = list(gpu_history["utilization_history"])

    min_allocated, max_allocated, avg_allocated = _summarize_history(allocated_memory_history)
    min_percentage, max_percentage, avg_percentage = _summarize_history(percentage_history)
    min_utilization, max_utilization, avg_utilization = _summarize_history(utilization_history)

    memory_figure = _time_series_figure(
        times,
        [
            ("Allocated Memory (GB)", allocated_memory_history, 'blue'),
            ("Free Memory (GB)", [sample[1] for sample in memory_samples], 'green'),
            ("Total Memory (GB)", [sample[2] for sample in memory_samples], 'gray'),
        ],
        title=f"GPU Memory Usage Over Time ({gpu_name})",
        yaxis_title="Memory (GB)",
        legend_title="Memory Type",
    )
    memory_percentage_figure = _time_series_figure(
        times,
        [("Memory Usage (%)", percentage_history, 'purple')],
        title=f"GPU Memory Usage Percentage Over Time ({gpu_name})",
        yaxis_title="Memory Usage (%)",
        yaxis_range=(0, 100),
    )
    utilization_figure = _time_series_figure(
        times,
        [("GPU Utilization (%)", utilization_history, 'orange')],
        title=f"GPU Utilization Over Time ({gpu_name})",
        yaxis_title="Utilization (%)",
        yaxis_range=(0, 100),
    )

    # Figures first, then the LED strings in Output order.
    return (
        memory_figure,
        memory_percentage_figure,
        utilization_figure,
        f"{allocated_memory:.2f}",
        f"{min_allocated:.2f}",
        f"{max_allocated:.2f}",
        f"{avg_allocated:.2f}",
        f"{memory_percentage:.2f}",
        f"{min_percentage:.2f}",
        f"{max_percentage:.2f}",
        f"{avg_percentage:.2f}",
        f"{gpu_utilization:.2f}",
        f"{min_utilization:.2f}",
        f"{max_utilization:.2f}",
        f"{avg_utilization:.2f}"
    )
if __name__ == '__main__':
    # The first command-line argument is the last octet of the 10.119.2.x
    # address the server binds to. Fail with a usage message rather than a
    # bare IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <last octet of the 10.119.2.x bind address>")
    # NOTE(review): debug=True is combined with a network-reachable bind
    # address here; confirm this is intended outside development.
    app.run_server(host=f'10.119.2.{sys.argv[1]}', port=8050, debug=True)