Ákos Hadnagy committed
Commit · b6b18a0
Parent(s): 954d017

UI improvements

Files changed:
- app.py (+197 -44)
- scenario_mappings.json (+9 -9)

app.py
CHANGED
@@ -31,6 +31,7 @@ class BenchmarkDashboard:
         self.reader = BenchmarkDataReader()
         self.df = None
         self.scenario_mappings = self.load_scenario_mappings()
+        self.metric_mappings = self.get_metric_mappings()
         self.load_data()
 
     def load_data(self) -> None:
@@ -72,16 +73,121 @@ class BenchmarkDashboard:
         # If not found in mappings, assume it's already a raw name
         return readable_name
 
+    def get_metric_mappings(self) -> Dict[str, str]:
+        """Get metric name mappings from technical to human-readable names."""
+        return {
+            "tokens_per_second_mean": "Tokens per Second",
+            "latency_seconds_mean": "Latency (seconds)",
+            "time_to_first_token_seconds_mean": "Time to First Token (seconds)",
+            "time_per_output_token_seconds_mean": "Time per Output Token (seconds)"
+        }
+
+    def get_readable_metric_name(self, metric_name: str) -> str:
+        """Get human-readable metric name or return original if not mapped."""
+        return self.metric_mappings.get(metric_name, metric_name)
+
+    def get_raw_metric_name(self, readable_name: str) -> str:
+        """Convert human-readable metric name back to raw metric name."""
+        for raw_name, mapped_name in self.metric_mappings.items():
+            if mapped_name == readable_name:
+                return raw_name
+        return readable_name
+
+    def get_best_scenario_for_model(self, model_name: str, metric: str = "tokens_per_second_mean") -> str:
+        """Get the best performing scenario for a given model."""
+        if self.df_pandas.empty:
+            return ""
+
+        # Filter data for this model
+        model_data = self.df_pandas[self.df_pandas['model_name'] == model_name]
+        if model_data.empty:
+            return ""
+
+        # Define priority order for scenarios (preference for kernelized/compiled)
+        priority_order = [
+            "eager_sdpa_flash_attention",
+            "eager_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_default",
+            "compiled_compile_max-autotune_sdpa_math",
+            "compiled_compile_max-autotune_eager_attn",
+            "eager_sdpa_default",
+            "eager_sdpa_math",
+            "eager_eager_attn"
+        ]
+
+        # Check if metric exists
+        if metric not in model_data.columns:
+            # Fallback to first available scenario in priority order
+            for scenario in priority_order:
+                if scenario in model_data['scenario_name'].values:
+                    return self.get_readable_scenario_name(scenario)
+            return self.get_readable_scenario_name(model_data['scenario_name'].iloc[0])
+
+        # Find best performing scenario (highest value for throughput metrics, lowest for latency)
+        is_latency_metric = 'latency' in metric.lower() or 'time' in metric.lower()
+
+        if is_latency_metric:
+            best_row = model_data.loc[model_data[metric].idxmin()]
+        else:
+            best_row = model_data.loc[model_data[metric].idxmax()]
+
+        return self.get_readable_scenario_name(best_row['scenario_name'])
+
+    def get_organized_scenarios(self, available_raw_scenarios: List[str]) -> Tuple[List[str], List[str]]:
+        """Organize scenarios into priority groups with separators."""
+        # Define priority scenarios (main recommended scenarios)
+        priority_raw_scenarios = [
+            "eager_sdpa_flash_attention",
+            "compiled_compile_max-autotune_sdpa_default"
+        ]
+
+        # Define expert/advanced scenarios (including efficient attention)
+        expert_raw_scenarios = [
+            "eager_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_eager_attn",
+            "compiled_compile_max-autotune_sdpa_math",
+            "eager_sdpa_default",
+            "eager_eager_attn",
+            "eager_sdpa_math"
+        ]
+
+        # Get available scenarios in priority order
+        priority_scenarios = []
+        expert_scenarios = []
+
+        # Add priority scenarios that are available
+        for raw_scenario in priority_raw_scenarios:
+            if raw_scenario in available_raw_scenarios:
+                readable_name = self.get_readable_scenario_name(raw_scenario)
+                priority_scenarios.append(readable_name)
+
+        # Add expert scenarios that are available
+        for raw_scenario in expert_raw_scenarios:
+            if raw_scenario in available_raw_scenarios:
+                readable_name = self.get_readable_scenario_name(raw_scenario)
+                expert_scenarios.append(readable_name)
+
+        # Combine with separator
+        all_scenarios = priority_scenarios.copy()
+        if expert_scenarios:
+            all_scenarios.append("─── Advanced/Developer Options ───")
+            all_scenarios.extend(expert_scenarios)
+
+        # Return all scenarios (no default selections for multi-select anymore)
+        return all_scenarios, []
+
+    def get_filter_options(self) -> Tuple[List[str], List[str], List[str], List[str], List[str], str, str]:
         """Get unique values for filter dropdowns and date range."""
         if self.df_pandas.empty:
-            return [], [], [], [], "", ""
+            return [], [], [], [], [], "", ""
 
         models = sorted(self.df_pandas['model_name'].dropna().unique().tolist())
 
+        # Get organized scenarios with priority ordering and default selections
        raw_scenarios = sorted(self.df_pandas['scenario_name'].dropna().unique().tolist())
+        scenarios, default_scenarios = self.get_organized_scenarios(raw_scenarios)
 
         gpus = sorted(self.df_pandas['gpu_name'].dropna().unique().tolist())
 
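Note: the two metric-name helpers form an invertible pair as long as the readable labels stay unique. A standalone sketch of the round-trip (the dict literal mirrors `get_metric_mappings()`; everything else is illustrative):

    metric_mappings = {
        "tokens_per_second_mean": "Tokens per Second",
        "latency_seconds_mean": "Latency (seconds)",
    }

    def get_readable_metric_name(name: str) -> str:
        # Fall back to the raw name when no mapping exists
        return metric_mappings.get(name, name)

    def get_raw_metric_name(readable: str) -> str:
        # Reverse lookup; returns the input unchanged if unmapped
        for raw, mapped in metric_mappings.items():
            if mapped == readable:
                return raw
        return readable

    assert get_raw_metric_name(get_readable_metric_name("tokens_per_second_mean")) == "tokens_per_second_mean"
    assert get_readable_metric_name("unknown_metric") == "unknown_metric"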
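Note: the min/max flip in `get_best_scenario_for_model` drives the new smart defaults: latency- and time-flavoured metrics pick the row with `idxmin()`, throughput metrics with `idxmax()`. A toy reduction of that logic (made-up numbers):

    import pandas as pd

    model_data = pd.DataFrame({
        "scenario_name": ["eager_sdpa_flash_attention", "eager_eager_attn"],
        "tokens_per_second_mean": [120.0, 80.0],
        "latency_seconds_mean": [0.8, 1.4],
    })

    def best_scenario(metric: str) -> str:
        # Lower is better for latency/time metrics, higher for throughput
        lower_is_better = "latency" in metric.lower() or "time" in metric.lower()
        idx = model_data[metric].idxmin() if lower_is_better else model_data[metric].idxmax()
        return model_data.loc[idx, "scenario_name"]

    print(best_scenario("tokens_per_second_mean"))  # eager_sdpa_flash_attention (highest)
    print(best_scenario("latency_seconds_mean"))    # eager_sdpa_flash_attention (lowest)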
@@ -122,9 +228,9 @@ class BenchmarkDashboard:
         min_date = self.df_pandas['timestamp'].min().strftime('%Y-%m-%d')
         max_date = self.df_pandas['timestamp'].max().strftime('%Y-%m-%d')
 
-        return models, scenarios, gpus, benchmark_runs, min_date, max_date
+        return models, scenarios, gpus, benchmark_runs, default_scenarios, min_date, max_date
 
+    def filter_data(self, selected_model: str, selected_scenarios: List[str],
                     selected_gpus: List[str], selected_run: str = None,
                     start_date: str = None, end_date: str = None) -> pd.DataFrame:
         """Filter data based on user selections."""

@@ -133,11 +239,12 @@ class BenchmarkDashboard:
 
         filtered_df = self.df_pandas.copy()
 
+        if selected_model:
+            filtered_df = filtered_df[filtered_df['model_name'] == selected_model]
         if selected_scenarios:
+            # Filter out separator lines and convert human-readable scenario names back to raw names for filtering
+            valid_scenarios = [scenario for scenario in selected_scenarios if not scenario.startswith("───")]
+            raw_scenarios = [self.get_raw_scenario_name(scenario) for scenario in valid_scenarios]
             filtered_df = filtered_df[filtered_df['scenario_name'].isin(raw_scenarios)]
         if selected_gpus:
             filtered_df = filtered_df[filtered_df['gpu_name'].isin(selected_gpus)]
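Note: `filter_data` now has to tolerate the decorative "───" separator that `get_organized_scenarios` injects into the dropdown choices. A standalone sketch of that guard, with made-up data and an illustrative readable-to-raw mapping:

    import pandas as pd

    df = pd.DataFrame({
        "scenario_name": ["eager_sdpa_flash_attention", "eager_eager_attn"],
        "tokens_per_second_mean": [120.0, 80.0],
    })
    readable_to_raw = {"Flash Attention": "eager_sdpa_flash_attention"}  # illustrative

    # Selection as it arrives from the multi-select dropdown, separator included
    selected = ["Flash Attention", "─── Advanced/Developer Options ───"]

    valid = [s for s in selected if not s.startswith("───")]  # drop the separator
    raw = [readable_to_raw.get(s, s) for s in valid]          # readable -> raw names
    print(df[df["scenario_name"].isin(raw)])                  # only the Flash Attention row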
@@ -201,9 +308,9 @@ class BenchmarkDashboard:
             x='scenario_display',
             y=metric,
             color='model_name',
+            title=f'Performance Comparison: {self.get_readable_metric_name(metric)}',
             labels={
+                metric: self.get_readable_metric_name(metric),
                 'scenario_display': 'Benchmark Scenario',
                 'model_name': 'Model'
             },
@@ -255,7 +362,7 @@ class BenchmarkDashboard:
                 hovertemplate=f'<b>{model}</b><br>' +
                               f'Scenario: {readable_scenario}<br>' +
                               'Time: %{x}<br>' +
+                              f'{self.get_readable_metric_name(metric)}: %{{y}}<br>' +
                               '<extra></extra>'
             ))
 
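Note: the new hovertemplate line relies on f-string brace escaping: `{{` and `}}` produce literal braces, so Plotly receives its own `%{y}` placeholder rather than a Python interpolation. A quick check:

    metric_label = "Tokens per Second"  # illustrative value
    template = f"{metric_label}: %{{y}}<br>"
    print(template)  # Tokens per Second: %{y}<br>  (Plotly fills in the y value)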
@@ -269,9 +376,9 @@ class BenchmarkDashboard:
         )
 
         fig.update_layout(
+            title=f'Historical Trends Across Benchmark Runs: {self.get_readable_metric_name(metric)}',
             xaxis_title='Timestamp',
+            yaxis_title=self.get_readable_metric_name(metric),
             height=500,
             hovermode='closest',
             showlegend=True,
@@ -325,7 +432,7 @@ class BenchmarkDashboard:
         return fig
 
     def create_metrics_summary_table(self, filtered_df: pd.DataFrame) -> pd.DataFrame:
-        """Create summary statistics table."""
+        """Create summary statistics table with each scenario as a separate row."""
         if filtered_df.empty:
             return pd.DataFrame({'Message': ['No data available for selected filters']})
 
@@ -336,24 +443,34 @@ class BenchmarkDashboard:
         ]
 
         summary_data = []
 
-        for model in filtered_df['model_name'].unique():
-            model_data = filtered_df[filtered_df['model_name'] == model]
+        # Group by scenario instead of model (since we're now single-model focused)
+        for scenario in filtered_df['scenario_name'].unique():
+            scenario_data = filtered_df[filtered_df['scenario_name'] == scenario]
+
+            # Get human-readable scenario name
+            readable_scenario = self.get_readable_scenario_name(scenario)
+
+            row = {'Scenario': readable_scenario}
+
+            # Add metrics for this scenario
             for metric in metrics_cols:
+                if metric in scenario_data.columns and not scenario_data[metric].isna().all():
+                    readable_metric = self.get_readable_metric_name(metric)
+
+                    # For scenarios, show the mean value (since each scenario should have one value per run)
+                    mean_value = scenario_data[metric].mean()
+                    row[readable_metric] = f"{mean_value:.2f}"
 
             summary_data.append(row)
 
         return pd.DataFrame(summary_data)
 
+    def update_dashboard(self, selected_model: str, selected_scenarios: List[str],
                          selected_gpus: List[str], selected_run: str, metric: str):
         """Update all dashboard components based on current filters."""
         filtered_df = self.filter_data(
+            selected_model, selected_scenarios, selected_gpus, selected_run
         )
 
         # Create charts
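Note: a condensed sketch of what the reworked per-scenario loop yields (toy numbers; the app additionally maps scenario and metric names to readable labels):

    import pandas as pd

    filtered = pd.DataFrame({
        "scenario_name": ["eager_sdpa_flash_attention"] * 2 + ["eager_eager_attn"] * 2,
        "tokens_per_second_mean": [118.0, 122.0, 79.0, 81.0],
    })

    rows = []
    for scenario in filtered["scenario_name"].unique():
        data = filtered[filtered["scenario_name"] == scenario]
        rows.append({
            "Scenario": scenario,
            "Tokens per Second": f"{data['tokens_per_second_mean'].mean():.2f}",
        })

    print(pd.DataFrame(rows))
    #                      Scenario Tokens per Second
    # 0  eager_sdpa_flash_attention            120.00
    # 1            eager_eager_attn             80.00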
@@ -363,23 +480,35 @@ class BenchmarkDashboard:
 
         # Summary stats
         if not filtered_df.empty:
+            model_name = filtered_df['model_name'].iloc[0]
+
+            # Get list of scenario names (raw) and convert to readable names
+            raw_scenario_names = sorted(filtered_df['scenario_name'].unique())
+            readable_scenario_names = [self.get_readable_scenario_name(scenario) for scenario in raw_scenario_names]
+            scenarios_list = ", ".join(readable_scenario_names)
+
+            date_range = f"{filtered_df['timestamp'].min().strftime('%Y-%m-%d')} to {filtered_df['timestamp'].max().strftime('%Y-%m-%d')}"
+            benchmark_runs = len(filtered_df.groupby(['timestamp', 'file_path']))
+
             summary_text = f"""
+            **Analysis Summary for {model_name}:**
+            - Date Range: {date_range}
+            - Benchmark Runs: {benchmark_runs}
+            - Total Data Points: {len(filtered_df)}
+
+            **Selected Scenarios:**
+            {scenarios_list}
             """
         else:
             summary_text = "No data available for current selection."
 
         return perf_chart, gpu_chart, summary_table, summary_text
 
+    def update_historical_trends(self, selected_model: str, selected_scenarios: List[str],
                                  selected_gpus: List[str], start_date: str, end_date: str, metric: str):
         """Update historical trends chart with date filtering."""
         filtered_df = self.filter_data(
+            selected_model, selected_scenarios, selected_gpus,
             start_date=start_date, end_date=end_date
         )
         trend_chart = self.create_historical_trend_chart(filtered_df, metric)
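Note: the new run counter treats each distinct (timestamp, file_path) pair as one benchmark run. A toy check of the `len(groupby(...))` idiom with made-up paths:

    import pandas as pd

    df = pd.DataFrame({
        "timestamp": ["2024-01-01", "2024-01-01", "2024-01-02"],
        "file_path": ["run_a.json", "run_a.json", "run_b.json"],
        "tokens_per_second_mean": [120.0, 118.0, 125.0],
    })

    # Two distinct (timestamp, file_path) pairs -> two runs
    print(len(df.groupby(["timestamp", "file_path"])))  # 2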
@@ -389,34 +518,38 @@
 def create_gradio_interface() -> gr.Interface:
     """Create the Gradio interface."""
     dashboard = BenchmarkDashboard()
-    models, scenarios, gpus, benchmark_runs, min_date, max_date = dashboard.get_filter_options()
+    models, scenarios, gpus, benchmark_runs, default_scenarios, min_date, max_date = dashboard.get_filter_options()
 
-    # Performance metrics options
+    # Performance metrics options (human-readable)
+    raw_metric_options = [
         "tokens_per_second_mean",
         "latency_seconds_mean",
         "time_to_first_token_seconds_mean",
         "time_per_output_token_seconds_mean"
     ]
+    metric_options = [dashboard.get_readable_metric_name(metric) for metric in raw_metric_options]
 
     with gr.Blocks(title="LLM Inference Performance Dashboard", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# 🚀 LLM Inference Performance Dashboard")
         gr.Markdown("Analyze and compare LLM inference performance across models, scenarios, and hardware configurations.")
+        gr.Markdown("*💡 **Smart Defaults**: The best performing scenario is automatically selected for each model based on throughput analysis.*")
 
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("## Filters")
 
+                model_filter = gr.Dropdown(
                     choices=models,
-                    value=models,
+                    value=models[0] if models else None,
+                    label="Select Model",
                     interactive=True
                 )
+                scenario_filter = gr.Dropdown(
                     choices=scenarios,
+                    value=[dashboard.get_best_scenario_for_model(models[0], "tokens_per_second_mean")] if models else [],
                     label="Select Scenarios",
+                    info="💡 The best performing scenario is automatically selected when you change models",
+                    multiselect=True,
                     interactive=True
                 )
                 gpu_filter = gr.CheckboxGroup(
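Note: the filter rework leans on `gr.Dropdown(multiselect=True)`, which passes a list of selected strings to event handlers. A minimal self-contained sketch with hypothetical choices:

    import gradio as gr

    def describe(selection):
        # selection is a list[str] because multiselect=True
        return f"Selected: {', '.join(selection) if selection else 'nothing'}"

    with gr.Blocks() as demo:
        picker = gr.Dropdown(
            choices=["Flash Attention", "Compiled + SDPA Default"],
            value=["Flash Attention"],
            multiselect=True,
            label="Select Scenarios",
            interactive=True,
        )
        out = gr.Markdown()
        picker.change(fn=describe, inputs=[picker], outputs=[out])

    # demo.launch()  # uncomment to try locally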
@@ -427,7 +560,7 @@ def create_gradio_interface() -> gr.Interface:
                 )
                 metric_selector = gr.Dropdown(
                     choices=metric_options,
-                    value="tokens_per_second_mean",
+                    value=dashboard.get_readable_metric_name("tokens_per_second_mean"),
                     label="Primary Metric",
                     interactive=True
                 )
@@ -494,16 +627,29 @@ def create_gradio_interface() -> gr.Interface:
             filtered_runs = [run for run in benchmark_runs if search_text.lower() in run.lower()]
             return gr.Dropdown(choices=filtered_runs, value=filtered_runs[0] if filtered_runs else None)
 
+        # Function to update scenarios when model changes
+        def update_scenarios_for_model(selected_model, current_metric):
+            if not selected_model:
+                return []
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(current_metric)
+            best_scenario = dashboard.get_best_scenario_for_model(selected_model, raw_metric)
+            return [best_scenario] if best_scenario else []
+
         # Update function for main dashboard (excluding historical trends)
+        def update_main(model_selected, scenarios_selected, gpus_selected, run_selected, metric):
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(metric)
             return dashboard.update_dashboard(
+                model_selected, scenarios_selected, gpus_selected, run_selected, raw_metric
             )
 
         # Update function for historical trends
+        def update_trends(model_selected, scenarios_selected, gpus_selected, start_dt, end_dt, metric):
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(metric)
             return dashboard.update_historical_trends(
+                model_selected, scenarios_selected, gpus_selected, start_dt, end_dt, raw_metric
            )
 
         # Set up interactivity for main dashboard
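Note: `update_main` and `update_trends` implement a single boundary rule: UI components hold readable labels, the data layer keys on raw column names, and the wrappers translate exactly once. The pattern in isolation (illustrative mapping):

    READABLE_TO_RAW = {"Tokens per Second": "tokens_per_second_mean"}  # illustrative

    def to_raw(metric_readable: str) -> str:
        # Unknown labels pass through unchanged, mirroring get_raw_metric_name
        return READABLE_TO_RAW.get(metric_readable, metric_readable)

    assert to_raw("Tokens per Second") == "tokens_per_second_mean"
    assert to_raw("some_raw_name") == "some_raw_name"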
@@ -525,6 +671,13 @@ def create_gradio_interface() -> gr.Interface:
         # Connect search field to filter benchmark runs
         run_search.change(fn=filter_benchmark_runs, inputs=[run_search], outputs=[benchmark_run_selector])
 
+        # Auto-update scenarios when model changes
+        model_filter.change(
+            fn=update_scenarios_for_model,
+            inputs=[model_filter, metric_selector],
+            outputs=[scenario_filter]
+        )
+
         # Initial load
         demo.load(fn=update_main, inputs=main_inputs, outputs=main_outputs)
         demo.load(fn=update_trends, inputs=trends_inputs, outputs=trends_outputs)
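Note: the `model_filter.change(...)` hookup works because a list returned by the handler becomes the new value of the multiselect `scenario_filter`. A self-contained sketch of the same pattern with hypothetical data:

    import gradio as gr

    BEST = {"model-a": "Flash Attention", "model-b": "Compiled + SDPA Default"}  # illustrative

    def pick_best(model):
        # The returned list becomes the dropdown's new value
        return [BEST[model]] if model in BEST else []

    with gr.Blocks() as demo:
        model = gr.Dropdown(choices=list(BEST), value="model-a", label="Model")
        scenarios = gr.Dropdown(
            choices=sorted(set(BEST.values())),
            value=["Flash Attention"],
            multiselect=True,
            label="Scenarios",
        )
        model.change(fn=pick_best, inputs=[model], outputs=[scenarios])

    # demo.launch()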
scenario_mappings.json
CHANGED

@@ -1,11 +1,11 @@
 {
+    "eager_sdpa_flash_attention": "Flash Attention",
+    "compiled_compile_max-autotune_sdpa_default": "Compiled + SDPA Default",
+    "eager_sdpa_default": "SDPA Default",
+    "eager_eager_attn": "Eager Attention",
+    "eager_sdpa_math": "SDPA Math Backend",
+    "eager_sdpa_efficient_attention": "Efficient Attention",
+    "compiled_compile_max-autotune_sdpa_efficient_attention": "Compiled + Efficient Attention",
+    "compiled_compile_max-autotune_eager_attn": "Compiled + Eager Attention",
+    "compiled_compile_max-autotune_sdpa_math": "Compiled + SDPA Math Backend"
 }
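Note: the app presumably consumes this file via `load_scenario_mappings()`, whose implementation is outside this diff; a plausible sketch of such a loader:

    import json
    from pathlib import Path

    def load_scenario_mappings(path: str = "scenario_mappings.json") -> dict:
        """Return the raw-name -> readable-name mapping, or {} if the file is missing."""
        mapping_file = Path(path)
        if not mapping_file.exists():
            return {}
        return json.loads(mapping_file.read_text())

    mappings = load_scenario_mappings()
    print(mappings.get("eager_sdpa_flash_attention"))  # Flash Attention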