Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
updated visuals to better reflect answer rate
Browse files
- app/app.py +12 -7
- app/app_utils.py +64 -37
- app/requirements.txt +2 -1
app/app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
-
import
|
| 4 |
|
| 5 |
from app_utils import load_results, visualize_leaderboard
|
| 6 |
|
|
@@ -45,12 +45,16 @@ def leaderboard(
|
|
| 45 |
|
| 46 |
if len(df) == 0:
|
| 47 |
# Show "no results" message in the plot
|
| 48 |
-
fig
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
return fig, pd.DataFrame(columns=["LLM", "Hallucination %", "Answer %", "Avg Summary Words"])
|
| 55 |
|
| 56 |
fig = visualize_leaderboard(df)
|
|
@@ -71,6 +75,7 @@ with gr.Blocks(
|
|
| 71 |
height: 40px;
|
| 72 |
}
|
| 73 |
footer { display: none !important; }
|
|
|
|
| 74 |
"""
|
| 75 |
) as demo:
|
| 76 |
gr.HTML(
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
+
import plotly.graph_objects as go
|
| 4 |
|
| 5 |
from app_utils import load_results, visualize_leaderboard
|
| 6 |
|
|
|
|
| 45 |
|
| 46 |
if len(df) == 0:
|
| 47 |
# Show "no results" message in the plot
|
| 48 |
+
fig = go.Figure()
|
| 49 |
+
fig.add_annotation(
|
| 50 |
+
text="No models found matching your filter",
|
| 51 |
+
xref="paper", yref="paper", x=0.5, y=0.5,
|
| 52 |
+
showarrow=False, font=dict(size=14, color="gray")
|
| 53 |
+
)
|
| 54 |
+
fig.update_layout(
|
| 55 |
+
xaxis=dict(visible=False), yaxis=dict(visible=False),
|
| 56 |
+
height=400, margin=dict(l=50, r=50, t=50, b=50)
|
| 57 |
+
)
|
| 58 |
return fig, pd.DataFrame(columns=["LLM", "Hallucination %", "Answer %", "Avg Summary Words"])
|
| 59 |
|
| 60 |
fig = visualize_leaderboard(df)
|
|
|
|
| 75 |
height: 40px;
|
| 76 |
}
|
| 77 |
footer { display: none !important; }
|
| 78 |
+
.modebar { display: none !important; }
|
| 79 |
"""
|
| 80 |
) as demo:
|
| 81 |
gr.HTML(
|
app/app_utils.py
CHANGED
|
@@ -3,11 +3,11 @@ import os
|
|
| 3 |
import json
|
| 4 |
from huggingface_hub import snapshot_download
|
| 5 |
import pandas as pd
|
| 6 |
-
import matplotlib.
|
| 7 |
-
|
|
|
|
| 8 |
from datetime import datetime
|
| 9 |
from sklearn.preprocessing import MinMaxScaler
|
| 10 |
-
import matplotlib.patheffects as pe
|
| 11 |
|
| 12 |
min_max_scaler = MinMaxScaler()
|
| 13 |
|
|
@@ -140,45 +140,72 @@ def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: flo
|
|
| 140 |
else: # to the right of the bar, black anyway
|
| 141 |
return hallucination_percent, 'black'
|
| 142 |
|
| 143 |
-
def visualize_leaderboard(df: pd.DataFrame) ->
|
| 144 |
-
|
| 145 |
plot_df = df.head(10).copy()
|
| 146 |
-
plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
plot_df = plot_df.iloc[::-1]
|
| 150 |
-
y_positions = range(len(plot_df))
|
| 151 |
-
|
| 152 |
-
plt.barh(y_positions, plot_df["Hallucination %"], color=plt.cm.RdYlGn_r(plot_df["normalized_hallucination_rate"]))
|
| 153 |
-
|
| 154 |
-
# Add value labels to the right of bars and answer rate dots at bar end
|
| 155 |
-
for i, row in enumerate(plot_df.itertuples()):
|
| 156 |
-
plt.text(row._2 + 0.2, i, f"{row._2}%", ha='left', va='center', fontsize=8, fontweight='bold')
|
| 157 |
-
# Answer rate indicator - colored dot at end of bar
|
| 158 |
-
ar_dot_color = '#22aa22' if row._3 >= 95 else '#cc3333'
|
| 159 |
-
plt.scatter(row._2, i, color=ar_dot_color, s=25, zorder=5)
|
| 160 |
|
| 161 |
-
# Strip org prefix
|
| 162 |
labels = [name.split("/")[-1] for name in plot_df["LLM"]]
|
| 163 |
-
plt.yticks(y_positions, labels, fontsize=8)
|
| 164 |
-
plt.xlabel("Hallucination Rate", fontsize=10)
|
| 165 |
-
plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
|
| 166 |
-
|
| 167 |
-
plt.gca().spines['top'].set_visible(False)
|
| 168 |
-
plt.gca().spines['right'].set_visible(False)
|
| 169 |
|
| 170 |
-
#
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
return fig
|
| 183 |
|
| 184 |
# %%
|
|
|
|
| 3 |
import json
|
| 4 |
from huggingface_hub import snapshot_download
|
| 5 |
import pandas as pd
|
| 6 |
+
import matplotlib.cm as cm
|
| 7 |
+
from matplotlib.colors import to_hex
|
| 8 |
+
import plotly.graph_objects as go
|
| 9 |
from datetime import datetime
|
| 10 |
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
| 11 |
|
| 12 |
min_max_scaler = MinMaxScaler()
|
| 13 |
|
|
|
|
| 140 |
else: # to the right of the bar, black anyway
|
| 141 |
return hallucination_percent, 'black'
|
| 142 |
|
| 143 |
+
def visualize_leaderboard(df: pd.DataFrame) -> go.Figure:
    """Build an interactive horizontal bar chart for the first 10 rows of ``df``.

    Each bar shows a model's "Hallucination %". Bars are coloured with the
    reversed red-yellow-green colormap applied to the min-max normalised
    hallucination rate of the plotted rows, and bars whose "Answer %" is below
    95 are drawn with a diagonal hatch pattern (there is no separate icon; the
    hatch plus the in-plot legend note is the low-answer-rate indicator).

    Args:
        df: Leaderboard rows. The columns "LLM", "Hallucination %" and
            "Answer %" are read. Only ``df.head(10)`` is plotted, so the
            caller is expected to have sorted the frame already.

    Returns:
        A Plotly figure containing a single horizontal ``go.Bar`` trace.

    Note:
        The module-level ``min_max_scaler`` is re-fitted on every call.
        An empty ``df`` is not handled here (the scaler rejects empty input);
        callers should short-circuit before calling this function.
    """
    answer_rate_threshold = 95

    plot_df = df.head(10).copy()
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(
        plot_df[["Hallucination %"]]
    )
    plot_df = plot_df.iloc[::-1]  # Reverse for bottom-to-top display

    # Strip org prefix for labels
    labels = [name.split("/")[-1] for name in plot_df["LLM"]]

    hallucination_rates = plot_df["Hallucination %"]
    answer_rates = plot_df["Answer %"]

    # Bar colours (RdYlGn_r over the normalised rate) and hatch patterns
    # ("/" marks a low answer rate, "" leaves the bar solid).
    colors = [
        to_hex(cm.RdYlGn_r(normalized))
        for normalized in plot_df["normalized_hallucination_rate"]
    ]
    patterns = [
        "/" if answer_rate < answer_rate_threshold else ""
        for answer_rate in answer_rates
    ]

    # Hover text with full details
    hover_texts = [
        f"<b>{label}</b><br>"
        f"Hallucination Rate: {hallucination_rate}%<br>"
        f"Answer Rate: {answer_rate}%"
        + (" ✓" if answer_rate >= answer_rate_threshold else " (below 95%)")
        for label, hallucination_rate, answer_rate in zip(
            labels, hallucination_rates, answer_rates
        )
    ]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=labels,
        x=hallucination_rates,
        orientation='h',
        marker=dict(
            color=colors,
            pattern_shape=patterns,
            pattern_fillmode="overlay",
            line=dict(width=0)
        ),
        text=[f"{val}%" for val in hallucination_rates],
        textposition='outside',
        textfont=dict(size=10, color='black'),
        hovertemplate="%{customdata}<extra></extra>",
        customdata=hover_texts
    ))

    # Title with copyright. Take one timestamp so the year and the date in
    # the title can never disagree.
    now = datetime.now()
    title_text = (
        "Grounded Hallucination Rate of Best LLMs · "
        f"© {now.year} Vectara · Created {now.strftime('%B %d, %Y')}"
    )

    # Leave 15% headroom for the outside value labels. Fall back to a unit
    # range when the largest rate is 0 (or NaN) so the axis is not [0, 0].
    peak_rate = hallucination_rates.max()
    x_upper = peak_rate * 1.15 if peak_rate > 0 else 1.0

    fig.update_layout(
        title=dict(text=title_text, font=dict(size=13), x=0.5, xanchor='center'),
        xaxis=dict(title="Hallucination Rate", range=[0, x_upper]),
        yaxis=dict(title=""),
        showlegend=False,
        height=400,
        margin=dict(l=180, r=50, t=50, b=40),
        annotations=[
            dict(
                text="Striped = Answer Rate < 95%",
                xref="paper", yref="paper", x=1.0, y=0.98,
                showarrow=False, font=dict(size=10, color="gray"), xanchor="right", yanchor="top"
            )
        ]
    )
    return fig
|
| 210 |
|
| 211 |
# %%
|
app/requirements.txt
CHANGED
|
@@ -3,4 +3,5 @@ requests==2.32.5
|
|
| 3 |
pandas==2.2.3
|
| 4 |
huggingface_hub>=0.20.0
|
| 5 |
matplotlib==3.10.3
|
| 6 |
-
scikit-learn==1.6.1
|
|
|
|
|
|
| 3 |
pandas==2.2.3
|
| 4 |
huggingface_hub>=0.20.0
|
| 5 |
matplotlib==3.10.3
|
| 6 |
+
scikit-learn==1.6.1
|
| 7 |
+
plotly>=5.18.0
|