Ákos Hadnagy committed
Commit · b6b18a0
Parent(s): 954d017

UI improvements

Files changed:
- app.py (+197 -44)
- scenario_mappings.json (+9 -9)

app.py
CHANGED
@@ -31,6 +31,7 @@ class BenchmarkDashboard:
         self.reader = BenchmarkDataReader()
         self.df = None
         self.scenario_mappings = self.load_scenario_mappings()
+        self.metric_mappings = self.get_metric_mappings()
         self.load_data()
 
     def load_data(self) -> None:
@@ -72,16 +73,121 @@ class BenchmarkDashboard:
         # If not found in mappings, assume it's already a raw name
         return readable_name
 
+    def get_metric_mappings(self) -> Dict[str, str]:
+        """Get metric name mappings from technical to human-readable names."""
+        return {
+            "tokens_per_second_mean": "Tokens per Second",
+            "latency_seconds_mean": "Latency (seconds)",
+            "time_to_first_token_seconds_mean": "Time to First Token (seconds)",
+            "time_per_output_token_seconds_mean": "Time per Output Token (seconds)"
+        }
+
+    def get_readable_metric_name(self, metric_name: str) -> str:
+        """Get human-readable metric name or return original if not mapped."""
+        return self.metric_mappings.get(metric_name, metric_name)
+
+    def get_raw_metric_name(self, readable_name: str) -> str:
+        """Convert human-readable metric name back to raw metric name."""
+        for raw_name, mapped_name in self.metric_mappings.items():
+            if mapped_name == readable_name:
+                return raw_name
+        return readable_name
+
+    def get_best_scenario_for_model(self, model_name: str, metric: str = "tokens_per_second_mean") -> str:
+        """Get the best performing scenario for a given model."""
+        if self.df_pandas.empty:
+            return ""
+
+        # Filter data for this model
+        model_data = self.df_pandas[self.df_pandas['model_name'] == model_name]
+        if model_data.empty:
+            return ""
+
+        # Define priority order for scenarios (preference for kernelized/compiled)
+        priority_order = [
+            "eager_sdpa_flash_attention",
+            "eager_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_default",
+            "compiled_compile_max-autotune_sdpa_math",
+            "compiled_compile_max-autotune_eager_attn",
+            "eager_sdpa_default",
+            "eager_sdpa_math",
+            "eager_eager_attn"
+        ]
+
+        # Check if metric exists
+        if metric not in model_data.columns:
+            # Fallback to first available scenario in priority order
+            for scenario in priority_order:
+                if scenario in model_data['scenario_name'].values:
+                    return self.get_readable_scenario_name(scenario)
+            return self.get_readable_scenario_name(model_data['scenario_name'].iloc[0])
+
+        # Find best performing scenario (highest value for throughput metrics, lowest for latency)
+        is_latency_metric = 'latency' in metric.lower() or 'time' in metric.lower()
+
+        if is_latency_metric:
+            best_row = model_data.loc[model_data[metric].idxmin()]
+        else:
+            best_row = model_data.loc[model_data[metric].idxmax()]
+
+        return self.get_readable_scenario_name(best_row['scenario_name'])
+
+    def get_organized_scenarios(self, available_raw_scenarios: List[str]) -> Tuple[List[str], List[str]]:
+        """Organize scenarios into priority groups with separators."""
+        # Define priority scenarios (main recommended scenarios)
+        priority_raw_scenarios = [
+            "eager_sdpa_flash_attention",
+            "compiled_compile_max-autotune_sdpa_default"
+        ]
+
+        # Define expert/advanced scenarios (including efficient attention)
+        expert_raw_scenarios = [
+            "eager_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_eager_attn",
+            "compiled_compile_max-autotune_sdpa_math",
+            "eager_sdpa_default",
+            "eager_eager_attn",
+            "eager_sdpa_math"
+        ]
+
+        # Get available scenarios in priority order
+        priority_scenarios = []
+        expert_scenarios = []
+
+        # Add priority scenarios that are available
+        for raw_scenario in priority_raw_scenarios:
+            if raw_scenario in available_raw_scenarios:
+                readable_name = self.get_readable_scenario_name(raw_scenario)
+                priority_scenarios.append(readable_name)
+
+        # Add expert scenarios that are available
+        for raw_scenario in expert_raw_scenarios:
+            if raw_scenario in available_raw_scenarios:
+                readable_name = self.get_readable_scenario_name(raw_scenario)
+                expert_scenarios.append(readable_name)
+
+        # Combine with separator
+        all_scenarios = priority_scenarios.copy()
+        if expert_scenarios:
+            all_scenarios.append("─── Advanced/Developer Options ───")
+            all_scenarios.extend(expert_scenarios)
+
+        # Return all scenarios (no default selections for multi-select anymore)
+        return all_scenarios, []
+
+    def get_filter_options(self) -> Tuple[List[str], List[str], List[str], List[str], List[str], str, str]:
         """Get unique values for filter dropdowns and date range."""
         if self.df_pandas.empty:
-            return [], [], [], [], "", ""
+            return [], [], [], [], [], "", ""
 
         models = sorted(self.df_pandas['model_name'].dropna().unique().tolist())
 
+        # Get organized scenarios with priority ordering and default selections
        raw_scenarios = sorted(self.df_pandas['scenario_name'].dropna().unique().tolist())
+        scenarios, default_scenarios = self.get_organized_scenarios(raw_scenarios)
 
         gpus = sorted(self.df_pandas['gpu_name'].dropna().unique().tolist())
 
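Note: the two metric-name helpers form an invertible pair as long as the readable labels stay unique. A standalone sketch of the round-trip (the dict literal mirrors `get_metric_mappings()`; everything else is illustrative):

    metric_mappings = {
        "tokens_per_second_mean": "Tokens per Second",
        "latency_seconds_mean": "Latency (seconds)",
    }

    def get_readable_metric_name(name: str) -> str:
        # Fall back to the raw name when no mapping exists
        return metric_mappings.get(name, name)

    def get_raw_metric_name(readable: str) -> str:
        # Reverse lookup; returns the input unchanged if unmapped
        for raw, mapped in metric_mappings.items():
            if mapped == readable:
                return raw
        return readable

    assert get_raw_metric_name(get_readable_metric_name("tokens_per_second_mean")) == "tokens_per_second_mean"
    assert get_readable_metric_name("unknown_metric") == "unknown_metric"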
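Note: the min/max flip in `get_best_scenario_for_model` drives the new smart defaults: latency- and time-flavoured metrics pick the row with `idxmin()`, throughput metrics with `idxmax()`. A toy reduction of that logic (made-up numbers):

    import pandas as pd

    model_data = pd.DataFrame({
        "scenario_name": ["eager_sdpa_flash_attention", "eager_eager_attn"],
        "tokens_per_second_mean": [120.0, 80.0],
        "latency_seconds_mean": [0.8, 1.4],
    })

    def best_scenario(metric: str) -> str:
        # Lower is better for latency/time metrics, higher for throughput
        lower_is_better = "latency" in metric.lower() or "time" in metric.lower()
        idx = model_data[metric].idxmin() if lower_is_better else model_data[metric].idxmax()
        return model_data.loc[idx, "scenario_name"]

    print(best_scenario("tokens_per_second_mean"))  # eager_sdpa_flash_attention (highest)
    print(best_scenario("latency_seconds_mean"))    # eager_sdpa_flash_attention (lowest)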
@@ -122,9 +228,9 @@ class BenchmarkDashboard:
         min_date = self.df_pandas['timestamp'].min().strftime('%Y-%m-%d')
         max_date = self.df_pandas['timestamp'].max().strftime('%Y-%m-%d')
 
-        return models, scenarios, gpus, benchmark_runs, min_date, max_date
+        return models, scenarios, gpus, benchmark_runs, default_scenarios, min_date, max_date
 
+    def filter_data(self, selected_model: str, selected_scenarios: List[str],
                     selected_gpus: List[str], selected_run: str = None,
                     start_date: str = None, end_date: str = None) -> pd.DataFrame:
         """Filter data based on user selections."""

@@ -133,11 +239,12 @@ class BenchmarkDashboard:
 
         filtered_df = self.df_pandas.copy()
 
+        if selected_model:
+            filtered_df = filtered_df[filtered_df['model_name'] == selected_model]
         if selected_scenarios:
+            # Filter out separator lines and convert human-readable scenario names back to raw names for filtering
+            valid_scenarios = [scenario for scenario in selected_scenarios if not scenario.startswith("───")]
+            raw_scenarios = [self.get_raw_scenario_name(scenario) for scenario in valid_scenarios]
             filtered_df = filtered_df[filtered_df['scenario_name'].isin(raw_scenarios)]
         if selected_gpus:
             filtered_df = filtered_df[filtered_df['gpu_name'].isin(selected_gpus)]
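Note: `filter_data` now has to tolerate the decorative "───" separator that `get_organized_scenarios` injects into the dropdown choices. A standalone sketch of that guard, with made-up data and an illustrative readable-to-raw mapping:

    import pandas as pd

    df = pd.DataFrame({
        "scenario_name": ["eager_sdpa_flash_attention", "eager_eager_attn"],
        "tokens_per_second_mean": [120.0, 80.0],
    })
    readable_to_raw = {"Flash Attention": "eager_sdpa_flash_attention"}  # illustrative

    # Selection as it arrives from the multi-select dropdown, separator included
    selected = ["Flash Attention", "─── Advanced/Developer Options ───"]

    valid = [s for s in selected if not s.startswith("───")]  # drop the separator
    raw = [readable_to_raw.get(s, s) for s in valid]          # readable -> raw names
    print(df[df["scenario_name"].isin(raw)])                  # only the Flash Attention row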
@@ -201,9 +308,9 @@ class BenchmarkDashboard:
             x='scenario_display',
             y=metric,
             color='model_name',
+            title=f'Performance Comparison: {self.get_readable_metric_name(metric)}',
             labels={
+                metric: self.get_readable_metric_name(metric),
                 'scenario_display': 'Benchmark Scenario',
                 'model_name': 'Model'
             },
@@ -255,7 +362,7 @@ class BenchmarkDashboard:
                 hovertemplate=f'<b>{model}</b><br>' +
                               f'Scenario: {readable_scenario}<br>' +
                               'Time: %{x}<br>' +
+                              f'{self.get_readable_metric_name(metric)}: %{{y}}<br>' +
                               '<extra></extra>'
             ))
 
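Note: the new hovertemplate line relies on f-string brace escaping: `{{` and `}}` produce literal braces, so Plotly receives its own `%{y}` placeholder rather than a Python interpolation. A quick check:

    metric_label = "Tokens per Second"  # illustrative value
    template = f"{metric_label}: %{{y}}<br>"
    print(template)  # Tokens per Second: %{y}<br>  (Plotly fills in the y value)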
@@ -269,9 +376,9 @@ class BenchmarkDashboard:
         )
 
         fig.update_layout(
+            title=f'Historical Trends Across Benchmark Runs: {self.get_readable_metric_name(metric)}',
             xaxis_title='Timestamp',
+            yaxis_title=self.get_readable_metric_name(metric),
             height=500,
             hovermode='closest',
             showlegend=True,
@@ -325,7 +432,7 @@ class BenchmarkDashboard:
         return fig
 
     def create_metrics_summary_table(self, filtered_df: pd.DataFrame) -> pd.DataFrame:
-        """Create summary statistics table."""
+        """Create summary statistics table with each scenario as a separate row."""
         if filtered_df.empty:
             return pd.DataFrame({'Message': ['No data available for selected filters']})
 
@@ -336,24 +443,34 @@ class BenchmarkDashboard:
         ]
 
         summary_data = []
 
-        for model in filtered_df['model_name'].unique():
-            model_data = filtered_df[filtered_df['model_name'] == model]
+        # Group by scenario instead of model (since we're now single-model focused)
+        for scenario in filtered_df['scenario_name'].unique():
+            scenario_data = filtered_df[filtered_df['scenario_name'] == scenario]
+
+            # Get human-readable scenario name
+            readable_scenario = self.get_readable_scenario_name(scenario)
+
+            row = {'Scenario': readable_scenario}
+
+            # Add metrics for this scenario
             for metric in metrics_cols:
+                if metric in scenario_data.columns and not scenario_data[metric].isna().all():
+                    readable_metric = self.get_readable_metric_name(metric)
+
+                    # For scenarios, show the mean value (since each scenario should have one value per run)
+                    mean_value = scenario_data[metric].mean()
+                    row[readable_metric] = f"{mean_value:.2f}"
 
             summary_data.append(row)
 
         return pd.DataFrame(summary_data)
 
+    def update_dashboard(self, selected_model: str, selected_scenarios: List[str],
                          selected_gpus: List[str], selected_run: str, metric: str):
         """Update all dashboard components based on current filters."""
         filtered_df = self.filter_data(
+            selected_model, selected_scenarios, selected_gpus, selected_run
         )
 
         # Create charts
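Note: a condensed sketch of what the reworked per-scenario loop yields (toy numbers; the app additionally maps scenario and metric names to readable labels):

    import pandas as pd

    filtered = pd.DataFrame({
        "scenario_name": ["eager_sdpa_flash_attention"] * 2 + ["eager_eager_attn"] * 2,
        "tokens_per_second_mean": [118.0, 122.0, 79.0, 81.0],
    })

    rows = []
    for scenario in filtered["scenario_name"].unique():
        data = filtered[filtered["scenario_name"] == scenario]
        rows.append({
            "Scenario": scenario,
            "Tokens per Second": f"{data['tokens_per_second_mean'].mean():.2f}",
        })

    print(pd.DataFrame(rows))
    #                      Scenario Tokens per Second
    # 0  eager_sdpa_flash_attention            120.00
    # 1            eager_eager_attn             80.00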
@@ -363,23 +480,35 @@ class BenchmarkDashboard:
 
         # Summary stats
         if not filtered_df.empty:
+            model_name = filtered_df['model_name'].iloc[0]
+
+            # Get list of scenario names (raw) and convert to readable names
+            raw_scenario_names = sorted(filtered_df['scenario_name'].unique())
+            readable_scenario_names = [self.get_readable_scenario_name(scenario) for scenario in raw_scenario_names]
+            scenarios_list = ", ".join(readable_scenario_names)
+
+            date_range = f"{filtered_df['timestamp'].min().strftime('%Y-%m-%d')} to {filtered_df['timestamp'].max().strftime('%Y-%m-%d')}"
+            benchmark_runs = len(filtered_df.groupby(['timestamp', 'file_path']))
+
             summary_text = f"""
+            **Analysis Summary for {model_name}:**
+            - Date Range: {date_range}
+            - Benchmark Runs: {benchmark_runs}
+            - Total Data Points: {len(filtered_df)}
+
+            **Selected Scenarios:**
+            {scenarios_list}
             """
         else:
             summary_text = "No data available for current selection."
 
         return perf_chart, gpu_chart, summary_table, summary_text
 
+    def update_historical_trends(self, selected_model: str, selected_scenarios: List[str],
                                  selected_gpus: List[str], start_date: str, end_date: str, metric: str):
         """Update historical trends chart with date filtering."""
         filtered_df = self.filter_data(
+            selected_model, selected_scenarios, selected_gpus,
             start_date=start_date, end_date=end_date
         )
         trend_chart = self.create_historical_trend_chart(filtered_df, metric)
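Note: the new run counter treats each distinct (timestamp, file_path) pair as one benchmark run. A toy check of the `len(groupby(...))` idiom with made-up paths:

    import pandas as pd

    df = pd.DataFrame({
        "timestamp": ["2024-01-01", "2024-01-01", "2024-01-02"],
        "file_path": ["run_a.json", "run_a.json", "run_b.json"],
        "tokens_per_second_mean": [120.0, 118.0, 125.0],
    })

    # Two distinct (timestamp, file_path) pairs -> two runs
    print(len(df.groupby(["timestamp", "file_path"])))  # 2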
@@ -389,34 +518,38 @@
 def create_gradio_interface() -> gr.Interface:
     """Create the Gradio interface."""
     dashboard = BenchmarkDashboard()
-    models, scenarios, gpus, benchmark_runs, min_date, max_date = dashboard.get_filter_options()
+    models, scenarios, gpus, benchmark_runs, default_scenarios, min_date, max_date = dashboard.get_filter_options()
 
-    # Performance metrics options
+    # Performance metrics options (human-readable)
+    raw_metric_options = [
         "tokens_per_second_mean",
         "latency_seconds_mean",
         "time_to_first_token_seconds_mean",
         "time_per_output_token_seconds_mean"
     ]
+    metric_options = [dashboard.get_readable_metric_name(metric) for metric in raw_metric_options]
 
     with gr.Blocks(title="LLM Inference Performance Dashboard", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# 🚀 LLM Inference Performance Dashboard")
         gr.Markdown("Analyze and compare LLM inference performance across models, scenarios, and hardware configurations.")
+        gr.Markdown("*💡 **Smart Defaults**: The best performing scenario is automatically selected for each model based on throughput analysis.*")
 
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("## Filters")
 
+                model_filter = gr.Dropdown(
                     choices=models,
-                    value=models,
+                    value=models[0] if models else None,
+                    label="Select Model",
                     interactive=True
                 )
+                scenario_filter = gr.Dropdown(
                     choices=scenarios,
+                    value=[dashboard.get_best_scenario_for_model(models[0], "tokens_per_second_mean")] if models else [],
                     label="Select Scenarios",
+                    info="💡 The best performing scenario is automatically selected when you change models",
+                    multiselect=True,
                     interactive=True
                 )
                 gpu_filter = gr.CheckboxGroup(
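Note: the filter rework leans on `gr.Dropdown(multiselect=True)`, which passes a list of selected strings to event handlers. A minimal self-contained sketch with hypothetical choices:

    import gradio as gr

    def describe(selection):
        # selection is a list[str] because multiselect=True
        return f"Selected: {', '.join(selection) if selection else 'nothing'}"

    with gr.Blocks() as demo:
        picker = gr.Dropdown(
            choices=["Flash Attention", "Compiled + SDPA Default"],
            value=["Flash Attention"],
            multiselect=True,
            label="Select Scenarios",
            interactive=True,
        )
        out = gr.Markdown()
        picker.change(fn=describe, inputs=[picker], outputs=[out])

    # demo.launch()  # uncomment to try locally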
@@ -427,7 +560,7 @@ def create_gradio_interface() -> gr.Interface:
                 )
                 metric_selector = gr.Dropdown(
                     choices=metric_options,
-                    value="tokens_per_second_mean",
+                    value=dashboard.get_readable_metric_name("tokens_per_second_mean"),
                     label="Primary Metric",
                     interactive=True
                 )
@@ -494,16 +627,29 @@ def create_gradio_interface() -> gr.Interface:
             filtered_runs = [run for run in benchmark_runs if search_text.lower() in run.lower()]
             return gr.Dropdown(choices=filtered_runs, value=filtered_runs[0] if filtered_runs else None)
 
+        # Function to update scenarios when model changes
+        def update_scenarios_for_model(selected_model, current_metric):
+            if not selected_model:
+                return []
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(current_metric)
+            best_scenario = dashboard.get_best_scenario_for_model(selected_model, raw_metric)
+            return [best_scenario] if best_scenario else []
+
         # Update function for main dashboard (excluding historical trends)
+        def update_main(model_selected, scenarios_selected, gpus_selected, run_selected, metric):
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(metric)
             return dashboard.update_dashboard(
+                model_selected, scenarios_selected, gpus_selected, run_selected, raw_metric
             )
 
         # Update function for historical trends
+        def update_trends(model_selected, scenarios_selected, gpus_selected, start_dt, end_dt, metric):
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(metric)
             return dashboard.update_historical_trends(
+                model_selected, scenarios_selected, gpus_selected, start_dt, end_dt, raw_metric
            )
 
         # Set up interactivity for main dashboard
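Note: `update_main` and `update_trends` implement a single boundary rule: UI components hold readable labels, the data layer keys on raw column names, and the wrappers translate exactly once. The pattern in isolation (illustrative mapping):

    READABLE_TO_RAW = {"Tokens per Second": "tokens_per_second_mean"}  # illustrative

    def to_raw(metric_readable: str) -> str:
        # Unknown labels pass through unchanged, mirroring get_raw_metric_name
        return READABLE_TO_RAW.get(metric_readable, metric_readable)

    assert to_raw("Tokens per Second") == "tokens_per_second_mean"
    assert to_raw("some_raw_name") == "some_raw_name"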
@@ -525,6 +671,13 @@ def create_gradio_interface() -> gr.Interface:
         # Connect search field to filter benchmark runs
         run_search.change(fn=filter_benchmark_runs, inputs=[run_search], outputs=[benchmark_run_selector])
 
+        # Auto-update scenarios when model changes
+        model_filter.change(
+            fn=update_scenarios_for_model,
+            inputs=[model_filter, metric_selector],
+            outputs=[scenario_filter]
+        )
+
         # Initial load
         demo.load(fn=update_main, inputs=main_inputs, outputs=main_outputs)
         demo.load(fn=update_trends, inputs=trends_inputs, outputs=trends_outputs)
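Note: the `model_filter.change(...)` hookup works because a list returned by the handler becomes the new value of the multiselect `scenario_filter`. A self-contained sketch of the same pattern with hypothetical data:

    import gradio as gr

    BEST = {"model-a": "Flash Attention", "model-b": "Compiled + SDPA Default"}  # illustrative

    def pick_best(model):
        # The returned list becomes the dropdown's new value
        return [BEST[model]] if model in BEST else []

    with gr.Blocks() as demo:
        model = gr.Dropdown(choices=list(BEST), value="model-a", label="Model")
        scenarios = gr.Dropdown(
            choices=sorted(set(BEST.values())),
            value=["Flash Attention"],
            multiselect=True,
            label="Scenarios",
        )
        model.change(fn=pick_best, inputs=[model], outputs=[scenarios])

    # demo.launch()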
scenario_mappings.json
CHANGED

@@ -1,11 +1,11 @@
 {
+    "eager_sdpa_flash_attention": "Flash Attention",
+    "compiled_compile_max-autotune_sdpa_default": "Compiled + SDPA Default",
+    "eager_sdpa_default": "SDPA Default",
+    "eager_eager_attn": "Eager Attention",
+    "eager_sdpa_math": "SDPA Math Backend",
+    "eager_sdpa_efficient_attention": "Efficient Attention",
+    "compiled_compile_max-autotune_sdpa_efficient_attention": "Compiled + Efficient Attention",
+    "compiled_compile_max-autotune_eager_attn": "Compiled + Eager Attention",
+    "compiled_compile_max-autotune_sdpa_math": "Compiled + SDPA Math Backend"
 }
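Note: the app presumably consumes this file via `load_scenario_mappings()`, whose implementation is outside this diff; a plausible sketch of such a loader:

    import json
    from pathlib import Path

    def load_scenario_mappings(path: str = "scenario_mappings.json") -> dict:
        """Return the raw-name -> readable-name mapping, or {} if the file is missing."""
        mapping_file = Path(path)
        if not mapping_file.exists():
            return {}
        return json.loads(mapping_file.read_text())

    mappings = load_scenario_mappings()
    print(mappings.get("eager_sdpa_flash_attention"))  # Flash Attention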