|
|
|
|
|
|
|
|
import sys |
|
|
import dash |
|
|
from dash import dcc, html |
|
|
from dash.dependencies import Input, Output |
|
|
import plotly.graph_objs as go |
|
|
import nvidia_smi |
|
|
from collections import defaultdict, deque |
|
|
import dash_daq as daq |
|
|
|
|
|
|
|
|
# Upper bound on the number of samples each per-GPU series keeps; older
# samples fall off the left end of the deque automatically.
max_history_length = 100

# Names of the rolling series tracked for every GPU.
_HISTORY_SERIES = (
    "time_history",
    "memory_history",
    "utilization_history",
    "memory_percentage_history",
)

# Maps a GPU index to its dict of bounded series. An entry is created on
# first access, with one empty deque per name in _HISTORY_SERIES.
gpu_histories = defaultdict(
    lambda: {
        series: deque(maxlen=max_history_length) for series in _HISTORY_SERIES
    }
)

# Plotly template name applied to every figure built by this module.
plot_template = "ggplot2"
|
|
|
|
|
|
|
|
def get_gpu_metrics(device_id):
    """Query NVML for the memory and utilization figures of one GPU.

    Args:
        device_id: Index passed to ``nvmlDeviceGetHandleByIndex``.

    Returns:
        A 6-tuple ``(allocated_memory, allocated_memory_MB, free_memory,
        total_memory, gpu_utilization, gpu_name)``. The three ``*_memory``
        values are the NVML figures divided by ``1024 ** 3``;
        ``allocated_memory_MB`` is the same used amount divided by
        ``1024 ** 2``. If NVML raises ``NVMLError`` the error is printed and
        every value not yet assigned keeps its default (``0`` for the
        numbers, ``"Unknown GPU"`` for the name).
    """
    # Defaults returned when NVML cannot be queried. Every name used in the
    # return statement must be bound here, otherwise the error path raises
    # UnboundLocalError instead of returning.
    allocated_memory = 0
    allocated_memory_MB = 0
    free_memory = 0
    total_memory = 0
    gpu_utilization = 0
    gpu_name = "Unknown GPU"

    try:
        nvidia_smi.nvmlInit()
        try:
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)
            mem_info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
            util_info = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
            gpu_name = nvidia_smi.nvmlDeviceGetName(handle)
            # NOTE(review): some binding versions appear to return the name
            # as bytes; decode so it renders cleanly in titles.
            if isinstance(gpu_name, bytes):
                gpu_name = gpu_name.decode("utf-8", errors="replace")

            used_memory = mem_info.total - mem_info.free
            total_memory = mem_info.total / (1024 ** 3)
            free_memory = mem_info.free / (1024 ** 3)
            allocated_memory = used_memory / (1024 ** 3)
            allocated_memory_MB = used_memory / (1024 ** 2)

            gpu_utilization = util_info.gpu
        finally:
            # Balance the successful nvmlInit() even when a query fails.
            nvidia_smi.nvmlShutdown()
    except nvidia_smi.NVMLError as error:
        print(f"Error fetching GPU metrics: {error}")

    return allocated_memory, allocated_memory_MB, free_memory, total_memory, gpu_utilization, gpu_name
|
|
|
|
|
|
|
|
def get_gpu_list():
    """Enumerate the GPUs visible to NVML as dropdown options.

    Returns:
        A list of ``{"label": "GPU <i> (<name>)", "value": <i>}`` dicts, one
        per device index reported by ``nvmlDeviceGetCount``. If NVML raises
        ``NVMLError`` the error is printed and the entries collected so far
        (possibly none) are returned.
    """
    gpu_list = []
    try:
        nvidia_smi.nvmlInit()
        try:
            device_count = nvidia_smi.nvmlDeviceGetCount()
            for i in range(device_count):
                handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
                gpu_name = nvidia_smi.nvmlDeviceGetName(handle)
                # NOTE(review): some binding versions appear to return the
                # name as bytes, which would show up as b'...' in the label.
                if isinstance(gpu_name, bytes):
                    gpu_name = gpu_name.decode("utf-8", errors="replace")
                gpu_list.append({"label": f"GPU {i} ({gpu_name})", "value": i})
        finally:
            # Balance the successful nvmlInit() even when enumeration fails.
            nvidia_smi.nvmlShutdown()
    except nvidia_smi.NVMLError as error:
        print(f"Error fetching GPU list: {error}")
    return gpu_list
|
|
|
|
|
|
|
|
app = dash.Dash(__name__)

# Colour shared by every LED read-out on the page.
_LED_COLOR = '#FF5E5E'

# Statistics shown for each metric, in display order. The lower-cased name
# is the suffix of the corresponding LEDDisplay component id.
_STAT_NAMES = ("Current", "Min", "Max", "Avg")


def _stat_led(id_prefix, stat_name, *, is_last):
    """Build one labelled LED read-out with id ``<id_prefix>-<stat_name lower>``."""
    # Dash style dicts use camelCased CSS property names.
    style = {'display': 'inline-block'}
    if not is_last:
        # Every read-out except the last in a row gets right-hand spacing.
        style['marginRight'] = '20px'
    return html.Div([
        html.Label(f"{stat_name}:"),
        daq.LEDDisplay(
            id=f"{id_prefix}-{stat_name.lower()}",
            value="0.00",
            size=20,
            color=_LED_COLOR,
        ),
    ], style=style)


def _metric_section(heading, id_prefix, graph_id):
    """Build a heading, a row of Current/Min/Max/Avg LEDs, and a graph."""
    last_index = len(_STAT_NAMES) - 1
    led_row = [
        _stat_led(id_prefix, stat_name, is_last=(index == last_index))
        for index, stat_name in enumerate(_STAT_NAMES)
    ]
    return html.Div([
        html.H3(heading),
        html.Div(led_row, style={'display': 'flex', 'marginBottom': '20px'}),
        dcc.Graph(id=graph_id),
    ])


# Page layout: title, GPU selector, three metric sections, and the interval
# timer (1000 ms) whose n_intervals property is a callback input.
app.layout = html.Div([
    html.H1("Real-Time GPU Metrics"),
    html.H4("Ashish"),
    html.Div([
        html.Label("Select GPU:"),
        dcc.Dropdown(
            id='gpu-dropdown',
            options=get_gpu_list(),
            value=0
        )
    ]),
    _metric_section("GPU Memory Usage (GB)", 'memory', 'gpu-memory-graph'),
    _metric_section("GPU Memory Usage (%)", 'memory-percentage', 'gpu-memory-percentage-graph'),
    _metric_section("GPU Utilization (%)", 'utilization', 'gpu-utilization-graph'),
    dcc.Interval(
        id='interval-component',
        interval=1000,
        n_intervals=0
    )
])
|
|
|
|
|
def _summarize(samples):
    """Return ``(min, max, mean)`` of *samples*, or ``(0, 0, 0)`` when empty."""
    if not samples:
        return 0, 0, 0
    return min(samples), max(samples), sum(samples) / len(samples)


def _no_gpu_figure():
    """Build the placeholder figure shown when no GPU data is available."""
    return go.Figure(
        layout=dict(
            title="No GPU Available",
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            annotations=[dict(
                text="No GPU detected!",
                xref="paper",
                yref="paper",
                showarrow=False,
                font=dict(size=20)
            )]
        )
    )


def _time_series_figure(times, traces, *, title, yaxis_title, yaxis, legend_title=None):
    """Build a line chart with one trace per ``(name, values, color)`` tuple.

    All traces share *times* as their x values. *yaxis* is the y-axis layout
    dict; *legend_title* is only applied when given.
    """
    figure = go.Figure()
    for trace_name, values, color in traces:
        figure.add_trace(go.Scatter(
            x=times,
            y=values,
            name=trace_name,
            mode='lines',
            line=dict(color=color)
        ))

    # Keyword order mirrors the original update_layout calls.
    layout_options = {
        "title": title,
        "xaxis_title": "Time (s)",
        "yaxis_title": yaxis_title,
    }
    if legend_title is not None:
        layout_options["legend_title"] = legend_title
    layout_options["template"] = plot_template
    layout_options["xaxis"] = dict(type='linear')
    layout_options["yaxis"] = yaxis
    figure.update_layout(**layout_options)
    return figure


@app.callback(
    [Output('gpu-memory-graph', 'figure'),
     Output('gpu-memory-percentage-graph', 'figure'),
     Output('gpu-utilization-graph', 'figure'),
     Output('memory-current', 'value'),
     Output('memory-min', 'value'),
     Output('memory-max', 'value'),
     Output('memory-avg', 'value'),
     Output('memory-percentage-current', 'value'),
     Output('memory-percentage-min', 'value'),
     Output('memory-percentage-max', 'value'),
     Output('memory-percentage-avg', 'value'),
     Output('utilization-current', 'value'),
     Output('utilization-min', 'value'),
     Output('utilization-max', 'value'),
     Output('utilization-avg', 'value')],
    [Input('interval-component', 'n_intervals'),
     Input('gpu-dropdown', 'value')]
)
def update_graphs(n, selected_gpu):
    """Refresh the three graphs and twelve LED read-outs for the selected GPU.

    Args:
        n: Current ``n_intervals`` of the interval component; recorded as the
            x value of the new sample.
        selected_gpu: Value of the GPU dropdown (a device index), or ``None``
            when the dropdown has no selection.

    Returns:
        A 15-tuple matching the declared outputs: three figures followed by
        current/min/max/avg strings (two decimals) for memory, memory
        percentage and utilization. When no GPU data is available, three
        placeholder figures and twelve ``"0.00"`` strings are returned.
    """
    if selected_gpu is None:
        # Nothing selected, so there is no device index to query.
        total_memory = 0
    else:
        (allocated_memory, _allocated_memory_mb, free_memory,
         total_memory, gpu_utilization, gpu_name) = get_gpu_metrics(selected_gpu)

    # get_gpu_metrics leaves total_memory at 0 when the query fails. Used
    # memory is not a reliable signal because an idle GPU can report 0 used.
    if not total_memory:
        placeholder = _no_gpu_figure()
        # 3 figures + 12 LED values = the 15 declared outputs.
        return (placeholder,) * 3 + ("0.00",) * 12

    memory_percentage = (allocated_memory / total_memory) * 100

    # Append the new sample to this GPU's rolling history.
    gpu_history = gpu_histories[selected_gpu]
    gpu_history["time_history"].append(n)
    gpu_history["memory_history"].append((allocated_memory, free_memory, total_memory))
    gpu_history["utilization_history"].append(gpu_utilization)
    gpu_history["memory_percentage_history"].append(memory_percentage)

    times = list(gpu_history["time_history"])
    allocated_memory_history = [mem[0] for mem in gpu_history["memory_history"]]
    free_memory_history = [mem[1] for mem in gpu_history["memory_history"]]
    total_memory_history = [mem[2] for mem in gpu_history["memory_history"]]
    percentage_history = list(gpu_history["memory_percentage_history"])
    utilization_history = list(gpu_history["utilization_history"])

    min_allocated, max_allocated, avg_allocated = _summarize(allocated_memory_history)
    min_percentage, max_percentage, avg_percentage = _summarize(percentage_history)
    min_utilization, max_utilization, avg_utilization = _summarize(utilization_history)

    memory_figure = _time_series_figure(
        times,
        [
            ("Allocated Memory (GB)", allocated_memory_history, 'blue'),
            ("Free Memory (GB)", free_memory_history, 'green'),
            ("Total Memory (GB)", total_memory_history, 'gray'),
        ],
        title=f"GPU Memory Usage Over Time ({gpu_name})",
        yaxis_title="Memory (GB)",
        yaxis=dict(type='linear'),
        legend_title="Memory Type",
    )

    memory_percentage_figure = _time_series_figure(
        times,
        [("Memory Usage (%)", percentage_history, 'purple')],
        title=f"GPU Memory Usage Percentage Over Time ({gpu_name})",
        yaxis_title="Memory Usage (%)",
        yaxis=dict(type='linear', range=[0, 100]),
    )

    utilization_figure = _time_series_figure(
        times,
        [("GPU Utilization (%)", utilization_history, 'orange')],
        title=f"GPU Utilization Over Time ({gpu_name})",
        yaxis_title="Utilization (%)",
        yaxis=dict(type='linear', range=[0, 100]),
    )

    return (
        memory_figure,
        memory_percentage_figure,
        utilization_figure,
        # "Current" uses the same unit (GB) as the min/max/avg beside it.
        f"{allocated_memory:.2f}",
        f"{min_allocated:.2f}",
        f"{max_allocated:.2f}",
        f"{avg_allocated:.2f}",
        f"{memory_percentage:.2f}",
        f"{min_percentage:.2f}",
        f"{max_percentage:.2f}",
        f"{avg_percentage:.2f}",
        f"{gpu_utilization:.2f}",
        f"{min_utilization:.2f}",
        f"{max_utilization:.2f}",
        f"{avg_utilization:.2f}"
    )
|
|
|
|
|
if __name__ == '__main__':
    # The first command-line argument is the last octet of the 10.119.2.x
    # address the server binds to.
    args = sys.argv
    if len(args) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit(f"usage: {args[0]} <last octet of the 10.119.2.x bind address>")
    # NOTE(review): debug=True is enabled while binding to a non-loopback
    # address; confirm exposing debug mode on this network is intended.
    app.run_server(host=f'10.119.2.{args[1]}', port=8050, debug=True)