Improve the performance radar chart; fix missing benchmark data in the online deployment
Browse files
app.py
CHANGED
|
@@ -702,47 +702,175 @@ def show_model_performance(df):
|
|
| 702 |
# Model comparison
|
| 703 |
st.subheader("Model Comparison")
|
| 704 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
selected_models = st.multiselect(
|
| 706 |
-
"Select models to compare",
|
| 707 |
-
|
| 708 |
-
default=
|
|
|
|
| 709 |
)
|
| 710 |
|
| 711 |
if selected_models:
|
| 712 |
comparison_data = df_display.loc[selected_models].T
|
| 713 |
comparison_data.index = [clean_benchmark_name(idx) for idx in comparison_data.index]
|
| 714 |
|
| 715 |
-
# Radar
|
| 716 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
fig = go.Figure()
|
| 718 |
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
|
| 724 |
-
# Close the radar chart
|
| 725 |
-
|
| 726 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
|
| 728 |
fig.add_trace(go.Scatterpolar(
|
| 729 |
-
r=
|
| 730 |
-
theta=
|
| 731 |
fill='toself',
|
| 732 |
-
name=
|
|
|
|
|
|
|
| 733 |
))
|
| 734 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 735 |
fig.update_layout(
|
| 736 |
polar=dict(
|
| 737 |
radialaxis=dict(
|
| 738 |
visible=True,
|
| 739 |
-
range=[
|
|
|
|
| 740 |
)),
|
| 741 |
showlegend=True,
|
| 742 |
-
title="Model Performance Radar Chart"
|
|
|
|
|
|
|
| 743 |
)
|
| 744 |
|
| 745 |
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 746 |
|
| 747 |
# Detailed comparison table
|
| 748 |
st.subheader("Detailed Comparison")
|
|
|
|
# --- Model Comparison section (body of show_model_performance(df)) ---
# Reconstructed from a diff render. Relies on names defined elsewhere in app.py:
#   df_display           - DataFrame: rows = model names, cols = benchmark names,
#                          float scores in [0, 1], NaN where a model was not run
#                          (assumed from the radar range clamping below — TODO confirm)
#   clean_benchmark_name - str -> str, prettifies a raw benchmark column name
#   st / pd / go         - streamlit, pandas, plotly.graph_objects
# NOTE(review): several literals below contain mojibake (e.g. "π", "π‘", "β’",
# "βΉοΈ", "β") where the original file presumably had emoji; reproduced as rendered —
# confirm against the actual repository file.

# Model comparison
st.subheader("Model Comparison")

# Benchmark selection for radar chart (always visible)
st.subheader("π Benchmark & Model Selection")

col1, col2 = st.columns([2, 1])

with col1:
    available_benchmarks = list(df_display.columns)
    # Default to the first 8 benchmarks, or all of them if there are fewer.
    default_benchmarks = available_benchmarks[:min(8, len(available_benchmarks))]

    selected_benchmarks_for_radar = st.multiselect(
        "Select benchmarks for radar chart",
        available_benchmarks,
        default=default_benchmarks,
        format_func=clean_benchmark_name,
        help="Choose which benchmarks to display in the radar chart",
    )

with col2:
    complete_data_only = st.checkbox(
        "Complete data only",
        value=True,
        help="Show only models that have data for ALL selected benchmarks",
    )

# Filter available models based on benchmark selection and complete-data setting.
if complete_data_only and selected_benchmarks_for_radar:
    # Keep only models with a non-NaN score for every selected benchmark.
    # Vectorized notna().all(axis=1) replaces a per-cell Python loop — one pass
    # instead of O(models * benchmarks) scalar .loc lookups, same result.
    complete_mask = df_display[selected_benchmarks_for_radar].notna().all(axis=1)
    available_models_for_selection = df_display.index[complete_mask].tolist()
    models_info = f"({len(available_models_for_selection)} models with complete data)"
else:
    available_models_for_selection = df_display.index.tolist()
    models_info = f"({len(available_models_for_selection)} models total)"

# Model selection with the filtered list.
if available_models_for_selection:
    # Pre-select the top 3 performers (by mean score across all benchmarks,
    # ignoring NaNs) from the currently available models.
    available_model_avg_scores = (
        df_display.loc[available_models_for_selection]
        .mean(axis=1, skipna=True)
        .sort_values(ascending=False)
    )
    default_selection = available_model_avg_scores.head(3).index.tolist()
else:
    default_selection = []

selected_models = st.multiselect(
    f"Select models to compare {models_info}",
    available_models_for_selection,
    default=default_selection,
    help="Models are filtered based on benchmark selection and complete data setting above",
)

if selected_models:
    # Transposed view (benchmarks as rows) feeds the detailed table below.
    comparison_data = df_display.loc[selected_models].T
    comparison_data.index = [clean_benchmark_name(idx) for idx in comparison_data.index]

    # Performance Radar Chart
    st.subheader("π Performance Radar Chart")

    if not selected_benchmarks_for_radar:
        st.info("Please select at least one benchmark above for the radar chart.")
    elif len(selected_models) == 0:
        st.info("Please select models above to see the radar chart comparison.")
    elif len(selected_models) > 10:
        st.warning(f"Too many models selected ({len(selected_models)}). Please select 10 or fewer models for the radar chart.")
        st.info("π‘ **Tip**: Use the search box above to filter models, then select a smaller subset for comparison.")
    else:
        # Show radar chart for 1-10 models.
        fig = go.Figure()

        # Axis labels: only the selected benchmarks, prettified.
        clean_benchmark_names = [clean_benchmark_name(b) for b in selected_benchmarks_for_radar]

        # Fixed palette; cycled when more than 10 models are plotted.
        colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                       '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

        for i, model in enumerate(selected_models):
            # Scores for the selected benchmarks only; any remaining NaN is
            # plotted as 0.0 (the missing-data expander below explains this).
            model_scores = []
            for benchmark in selected_benchmarks_for_radar:
                score = df_display.loc[model, benchmark]
                model_scores.append(0.0 if pd.isna(score) else float(score))

            # Close the polygon by repeating the first point at the end.
            radar_values = model_scores + [model_scores[0]]
            radar_benchmarks = clean_benchmark_names + [clean_benchmark_names[0]]

            # Legend label: strip any "org/" path prefix from the model name.
            model_display_name = model.split('/')[-1] if '/' in model else model

            model_color = colors_list[i % len(colors_list)]

            fig.add_trace(go.Scatterpolar(
                r=radar_values,
                theta=radar_benchmarks,
                fill='toself',
                name=model_display_name,
                line_color=model_color,
                hovertemplate='<b>%{theta}</b><br>Score: %{r:.3f}<extra></extra>',
            ))

        # Dynamic radial range: fit the plotted values with 10% padding,
        # clamped to [0, 1]. Vectorized dropna replaces a per-cell scan.
        value_series = (
            df_display.loc[selected_models, selected_benchmarks_for_radar]
            .stack()
            .dropna()
        )
        if not value_series.empty:
            min_val = float(value_series.min())
            max_val = float(value_series.max())
            range_padding = (max_val - min_val) * 0.1
            radar_min = max(0, min_val - range_padding)
            radar_max = min(1, max_val + range_padding)
        else:
            radar_min, radar_max = 0, 1

        # Taller chart when the legend gets crowded (more than 3 models).
        chart_height = 600 if len(selected_models) <= 3 else 700

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[radar_min, radar_max],
                    tickformat='.2f'
                )),
            showlegend=True,
            title=f"Model Performance Radar Chart ({len(selected_benchmarks_for_radar)} benchmarks, {len(selected_models)} models)",
            width=700,
            height=chart_height
        )

        st.plotly_chart(fig, use_container_width=True)

        # Explain missing values (only relevant when incomplete data is allowed).
        if not complete_data_only:
            missing_info = []
            for model in selected_models:
                missing_benchmarks = []
                for benchmark in selected_benchmarks_for_radar:
                    if pd.isna(df_display.loc[model, benchmark]):
                        missing_benchmarks.append(clean_benchmark_name(benchmark))
                if missing_benchmarks:
                    missing_info.append(f"β’ {model.split('/')[-1]}: {', '.join(missing_benchmarks)}")

            if missing_info:
                with st.expander("βΉοΈ Missing Data Information"):
                    st.write("Missing values are shown as 0 in the radar chart:")
                    for info in missing_info:
                        st.write(info)
        else:
            # With "Complete data only" enabled, the model list was pre-filtered,
            # so every selected model has data for every chosen benchmark.
            st.info("β All selected models have complete data for the chosen benchmarks.")

        # Readability tip for large selections.
        if len(selected_models) > 5:
            st.info(f"π‘ **Viewing {len(selected_models)} models**: For better readability, consider selecting fewer models or use the detailed comparison table below.")

# Detailed comparison table
st.subheader("Detailed Comparison")
|