Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 9,724 Bytes
8c3427d d0c57df 8c3427d d0c57df 8c3427d 7cd85bf 696341e 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 0e2da72 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d 81cb431 120684a 8c3427d 696341e d0c57df 8c3427d 0e2da72 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 7cd85bf d0c57df 7cd85bf 696341e 7cd85bf d0c57df 7cd85bf 8c3427d 7cd85bf 8c3427d d0c57df 8c3427d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 |
# %%
import os
import json
from huggingface_hub import snapshot_download
import pandas as pd
import matplotlib.cm as cm
from matplotlib.colors import to_hex
import plotly.graph_objects as go
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
# Shared module-level scaler; visualize_leaderboard() refits it on every call
# (via fit_transform) to normalize the plotted hallucination rates for coloring.
min_max_scaler = MinMaxScaler()
# %%
def pull_results(results_dir: str):
    """Download a snapshot of the ``vectara/results`` dataset repository.

    Args:
        results_dir: Local directory the snapshot is written to.
    """
    download_options = {
        "repo_id": "vectara/results",
        "repo_type": "dataset",
        "local_dir": results_dir,
    }
    snapshot_download(**download_options)
def extract_info_from_result_file(result_file):
    """Load one result JSON file and flatten it into a leaderboard row.

    The file is expected to look like::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {
                    "hallucination_rate": 8.34990059642147
                },
                "factual_consistency_rate": {
                    "factual_consistency_rate": 91.65009940357854
                },
                "answer_rate": {
                    "answer_rate": 100.0
                },
                "average_summary_length": {
                    "average_summary_length": 85.9
                }
            }
        }

    Optional top-level keys ``model_annotations``, ``category_results`` and
    ``text_complexity_results`` are read when present.

    Args:
        result_file: Path to the JSON result file.

    Returns:
        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``,
        ``"Avg Summary Words"``, ``"Model Size"``, ``"Accessibility"``,
        ``"category_results"`` and ``"text_complexity_results"``.

    Raises:
        KeyError: If a required ``config`` / ``results`` entry is missing.
    """
    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 (the JSON default) regardless of the platform locale.
    with open(result_file, 'r', encoding="utf-8") as handle:
        info = json.load(handle)

    # Extract model_annotations with defaults for missing data
    annotations = info.get("model_annotations", {})
    model_size = annotations.get("model_size", "unknown")
    accessibility = annotations.get("accessibility", "unknown")

    results = info["results"]
    return {
        # Trailing dashes are stripped from the model name.
        "LLM": info["config"]["model_name"].rstrip("-"),
        "Hallucination %": results["hallucination_rate"]["hallucination_rate"],
        "Answer %": results["answer_rate"]["answer_rate"],
        "Avg Summary Words": results["average_summary_length"]["average_summary_length"],
        "Model Size": model_size,
        "Accessibility": accessibility,
        "category_results": info.get("category_results", {}),
        "text_complexity_results": info.get("text_complexity_results", {}),
    }
def get_latest_result_file(dir: str):
    """Return the most recently modified ``.json`` file directly inside ``dir``.

    Files are ranked by modification time. When several files share the same
    modification time, the lexicographically greatest file name wins so the
    choice does not depend on the arbitrary order returned by ``os.listdir``.

    Args:
        dir: Directory to inspect (not searched recursively).

    Returns:
        The joined path of the selected file, or ``None`` when ``dir`` is not
        a directory or contains no ``.json`` files.
    """
    if not os.path.isdir(dir):
        return None

    json_names = [name for name in os.listdir(dir) if name.endswith(".json")]
    if not json_names:
        return None

    def recency_key(name):
        # (mtime, name): the name is only a deterministic tie-breaker.
        return os.path.getmtime(os.path.join(dir, name)), name

    return os.path.join(dir, max(json_names, key=recency_key))
def scan_and_extract(dir: str):
    """Walk ``dir`` recursively and extract one result per sub-directory.

    For every sub-directory found at any depth, the latest JSON file in it
    (as chosen by ``get_latest_result_file``) is parsed with
    ``extract_info_from_result_file``. JSON files located directly in ``dir``
    itself are not considered, and older JSON files in a sub-directory are
    ignored.

    Args:
        dir: Root directory to scan.

    Returns:
        A list of the extracted result dicts (empty when nothing is found).
    """
    results = []
    # Distinct loop names avoid shadowing the ``dir`` parameter.
    for root, subdir_names, _file_names in os.walk(dir):
        for subdir_name in subdir_names:
            result_file = get_latest_result_file(os.path.join(root, subdir_name))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results
def load_results(results_dir: str = "/tmp/hhem_results"):
    """Download the results dataset and load it into a DataFrame.

    The dataset snapshot is pulled into ``results_dir`` on disk, every result
    file found there is parsed, and the rows are returned sorted by
    ``"Hallucination %"`` ascending.

    Args:
        results_dir: Local directory used for the downloaded snapshot.

    Returns:
        A DataFrame with one row per result, the three metric columns rounded
        to 3 decimals, and an extra ``"LLM_lower_case"`` column.

    Raises:
        ValueError: If no results could be extracted from ``results_dir``.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")

    results = scan_and_extract(results_dir)
    if not results:
        raise ValueError(f"No results found in {results_dir}")
    print(f"Successfully extracted {len(results)} results")

    results_df = pd.DataFrame(results)
    # Replace the "TBD" placeholder BEFORE sorting: sorting a column that
    # mixes strings and numbers raises TypeError, and mapping it to 100 also
    # pushes such rows to the end of the ascending ranking.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
    return results_df
# Maps each dropdown display value to a ``(slice_type, slice_key)`` pair.
# ``slice_type`` is "overall", "complexity" or "category"; ``slice_key`` is the
# key looked up inside the per-model slice results (None for "overall").
_CATEGORY_SLICE_KEYS = (
    "business",
    "education",
    "finance",
    "law",
    "medicine",
    "politics",
    "science",
    "sports",
    "stocks",
    "technology",
)
DATA_SLICE_MAP = {
    "Overall": ("overall", None),
    "Low Complexity": ("complexity", "low_complexity_text"),
    "High Complexity": ("complexity", "high_complexity_text"),
}
# Category display names are simply the capitalized category keys.
DATA_SLICE_MAP.update(
    (category.capitalize(), ("category", category))
    for category in _CATEGORY_SLICE_KEYS
)
def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
    """Apply a data slice filter to recalculate metrics.

    Args:
        df: DataFrame with category_results and text_complexity_results columns
        slice_name: Display name of the slice (e.g., "Overall", "Low Complexity", "Business")

    Returns:
        ``df`` itself (unchanged) when ``slice_name`` is unknown or is the
        overall slice. Otherwise a new DataFrame containing only the rows that
        have data for the slice, with "Hallucination %", "Answer %" and
        "Avg Summary Words" replaced by the slice values (rounded to 3
        decimals) and sorted by Hallucination % ascending.
    """
    if slice_name not in DATA_SLICE_MAP:
        return df

    slice_type, slice_key = DATA_SLICE_MAP[slice_name]
    if slice_type == "overall":
        return df

    source_column = (
        "text_complexity_results" if slice_type == "complexity" else "category_results"
    )

    result_df = df.copy()
    rows_to_keep = []

    for idx, row in result_df.iterrows():
        data = row.get(source_column, {})
        # A missing value (e.g. NaN) is not a dict; skip the row instead of
        # failing on the key lookup below.
        if not isinstance(data, dict):
            continue

        slice_data = data.get(slice_key)
        # Covers a missing key, an empty dict and any non-dict payload.
        if not isinstance(slice_data, dict) or not slice_data:
            continue

        # Update metrics from slice data
        result_df.at[idx, "Hallucination %"] = round(
            slice_data.get("hallucination_rate", 0), 3
        )
        result_df.at[idx, "Answer %"] = round(
            slice_data.get("answer_rate", 0), 3
        )
        result_df.at[idx, "Avg Summary Words"] = round(
            slice_data.get("average_summary_length", 0), 3
        )
        rows_to_keep.append(idx)

    # Filter to only rows with data for this slice
    result_df = result_df.loc[rows_to_keep]

    # Re-sort by hallucination rate
    result_df = result_df.sort_values(by="Hallucination %", ascending=True)
    return result_df
# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick a label font size from the model name and hallucination value.

    Args:
        LLM: Model name; only its length is used.
        hallucination_percent: Hallucination value compared against 0.25.

    Returns:
        ``8.5`` when ``hallucination_percent`` is below 0.25 and the name is
        longer than 10 characters, otherwise ``9``.
    """
    # Only the combination of a small value and a long name gets the smaller
    # font; every other case uses the regular size.
    if hallucination_percent < 0.25 and len(LLM) > 10:
        return 8.5
    return 9
def determine_font_color(hallucination_percent: float) -> str:
    """Return the label color for a hallucination value.

    Values strictly between 0.25 and 0.65 map to ``'black'``; everything
    else (including both boundaries) maps to ``'white'``.
    """
    above_lower = hallucination_percent > 0.25
    below_upper = hallucination_percent < 0.65
    return 'black' if (above_lower and below_upper) else 'white'
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> "tuple[float, str]":
    """Decide where a model label is placed and which color it uses.

    The name length is compared against an estimated bar length of
    ``5 * hallucination_percent``.

    Args:
        LLM: Model name; only its length is used (besides the debug output).
        hallucination_percent: Hallucination value of the bar.

    Returns:
        A ``(x_position, font_color)`` tuple: ``(0.01, <color from
        determine_font_color>)`` when the name is shorter than the estimated
        bar length, otherwise ``(hallucination_percent, 'black')``.
    """
    name_length = len(LLM)
    # Debug trace kept as-is so the function's observable output is unchanged.
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if name_length < bar_length:
        # Label fits inside the bar: anchor it near the left edge.
        return 0.01, determine_font_color(hallucination_percent)
    # to the right of the bar, black anyway
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> go.Figure:
    """Create interactive horizontal bar chart with hatching for low answer rate.

    Only the first 10 rows of ``df`` are plotted, so callers are expected to
    pass it already sorted. Bars are colored with the reversed ``RdYlGn``
    colormap based on the min-max normalized hallucination rate, and bars
    whose answer rate is below 95 are drawn with a striped pattern.

    Args:
        df: DataFrame with "LLM", "Hallucination %" and "Answer %" columns.

    Returns:
        The plotly figure. When ``df`` has no rows, a placeholder figure with
        the same title and a "No results available" note is returned instead
        of raising.
    """
    # Read the clock once so the year and the date in the title always agree.
    now = datetime.now()
    # Title with copyright
    title_text = (
        "Grounded Hallucination Rate of Best LLMs · "
        f"© {now.year} Vectara · Created {now.strftime('%B %d, %Y')}"
    )
    title = dict(text=title_text, font=dict(size=13), x=0.5, xanchor='center')

    plot_df = df.head(10).copy()

    if plot_df.empty:
        # The scaler and max() below both fail on zero rows (e.g. a data slice
        # for which no model has results), so return an empty placeholder.
        empty_fig = go.Figure()
        empty_fig.update_layout(
            title=title,
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            showlegend=False,
            height=400,
            annotations=[
                dict(
                    text="No results available",
                    xref="paper", yref="paper", x=0.5, y=0.5,
                    showarrow=False, font=dict(size=12, color="gray")
                )
            ]
        )
        return empty_fig

    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(
        plot_df[["Hallucination %"]]
    )
    plot_df = plot_df.iloc[::-1]  # Reverse for bottom-to-top display

    # Strip org prefix for labels
    labels = [name.split("/")[-1] for name in plot_df["LLM"]]

    # Per-bar color (RdYlGn_r), pattern (hatched for low answer rate) and
    # hover text, collected in a single pass over the rows.
    colors = []
    patterns = []
    hover_texts = []
    for label, (_, row) in zip(labels, plot_df.iterrows()):
        colors.append(to_hex(cm.RdYlGn_r(row["normalized_hallucination_rate"])))
        patterns.append("/" if row["Answer %"] < 95 else "")
        hover_texts.append(
            f"<b>{label}</b><br>"
            f"Hallucination Rate: {row['Hallucination %']}%<br>"
            f"Answer Rate: {row['Answer %']}%"
            + (" ✓" if row["Answer %"] >= 95 else " (below 95%)")
        )

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=labels,
        x=plot_df["Hallucination %"],
        orientation='h',
        marker=dict(
            color=colors,
            pattern_shape=patterns,
            pattern_fillmode="overlay",
            line=dict(width=0)
        ),
        text=[f"{val}%" for val in plot_df["Hallucination %"]],
        textposition='outside',
        textfont=dict(size=10, color='black'),
        hovertemplate="%{customdata}<extra></extra>",
        customdata=hover_texts
    ))

    fig.update_layout(
        title=title,
        # 15% headroom on the x axis leaves room for the outside value labels.
        xaxis=dict(title="Hallucination Rate", range=[0, max(plot_df["Hallucination %"]) * 1.15]),
        yaxis=dict(title=""),
        showlegend=False,
        height=400,
        margin=dict(l=180, r=50, t=50, b=40),
        annotations=[
            dict(
                text="Striped = Answer Rate < 95%",
                xref="paper", yref="paper", x=1.0, y=0.98,
                showarrow=False, font=dict(size=10, color="gray"), xanchor="right", yanchor="top"
            )
        ]
    )
    return fig
# %%
# Script entry point: when run directly, download the results and print the
# resulting DataFrame. ``df`` is left as a module-level name.
if __name__ == "__main__":
    df = load_results()
    print(df)
# %%
|