# %%
import os
import json
from huggingface_hub import snapshot_download
import pandas as pd
import matplotlib.cm as cm
from matplotlib.colors import to_hex
import plotly.graph_objects as go
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
# Module-level scaler instance; visualize_leaderboard() re-fits it on every call
# to normalise the hallucination rates of the rows being plotted.
min_max_scaler = MinMaxScaler()
# %%
def pull_results(results_dir: str):
    """Download a snapshot of the ``vectara/results`` dataset repository.

    Args:
        results_dir: Local directory passed to ``snapshot_download`` as
            ``local_dir``; the downloaded files end up there.
    """
    download_options = {
        "repo_id": "vectara/results",
        "repo_type": "dataset",
        "local_dir": results_dir,
    }
    snapshot_download(**download_options)
def extract_info_from_result_file(result_file):
    """Load one result JSON file and flatten it into a leaderboard row.

    The file is expected to contain at least::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {
                    "hallucination_rate": 8.34990059642147
                },
                "factual_consistency_rate": {
                    "factual_consistency_rate": 91.65009940357854
                },
                "answer_rate": {
                    "answer_rate": 100.0
                },
                "average_summary_length": {
                    "average_summary_length": 85.9
                }
            }
        }

    The top-level keys ``model_annotations`` (with ``model_size`` and
    ``accessibility``), ``category_results`` and ``text_complexity_results``
    are optional; missing annotations become ``"unknown"`` and missing
    per-slice results become ``{}``.

    Args:
        result_file: Path of the JSON file to read.

    Returns:
        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``,
        ``"Avg Summary Words"``, ``"Model Size"``, ``"Accessibility"``,
        ``"category_results"`` and ``"text_complexity_results"``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``config.model_name`` or one of the required metrics
            under ``results`` is missing.
    """
    # Use a context manager so the handle is closed even if parsing fails,
    # and read as UTF-8 rather than whatever the locale default happens to be.
    with open(result_file, "r", encoding="utf-8") as handle:
        info = json.load(handle)

    # Extract model_annotations with defaults for missing data
    annotations = info.get("model_annotations", {})
    metrics = info["results"]

    return {
        # Trailing hyphens are stripped from the model name.
        "LLM": info["config"]["model_name"].rstrip("-"),
        "Hallucination %": metrics["hallucination_rate"]["hallucination_rate"],
        "Answer %": metrics["answer_rate"]["answer_rate"],
        "Avg Summary Words": metrics["average_summary_length"]["average_summary_length"],
        "Model Size": annotations.get("model_size", "unknown"),
        "Accessibility": annotations.get("accessibility", "unknown"),
        "category_results": info.get("category_results", {}),
        "text_complexity_results": info.get("text_complexity_results", {}),
    }
def get_latest_result_file(dir: str):
    """Return the most recently modified ``.json`` file directly inside ``dir``.

    Files are ranked by modification time (``os.path.getmtime``), not by any
    timestamp embedded in the file name. When several files share the same
    modification time the lexicographically greatest path wins, so the result
    does not depend on the arbitrary order of ``os.listdir``.

    Args:
        dir: Directory to inspect (not searched recursively).

    Returns:
        The joined path ``os.path.join(dir, name)`` of the selected file, or
        ``None`` if ``dir`` is not a directory or holds no ``.json`` entries.
    """
    if not os.path.isdir(dir):
        return None

    candidates = [
        os.path.join(dir, name)
        for name in os.listdir(dir)
        if name.endswith(".json")
    ]
    if not candidates:
        return None

    # Primary key: mtime. Secondary key: the path itself, as a deterministic
    # tie-breaker when mtimes are equal.
    return max(candidates, key=lambda path: (os.path.getmtime(path), path))
def scan_and_extract(dir: str):
    """Walk ``dir`` and extract the latest result of every nested directory.

    Every directory below ``dir`` (at any depth) is checked with
    ``get_latest_result_file``; when it yields a file, that single file is
    parsed with ``extract_info_from_result_file``. JSON files that sit
    directly in ``dir`` itself are not read, and older JSON files in a
    directory are ignored in favour of the latest one.

    Args:
        dir: Root directory to scan.

    Returns:
        A list with one extracted result dict per directory that contained at
        least one ``.json`` file; empty if none did.
    """
    results = []
    # Distinct loop names: the original reused ``dir`` for the inner loop,
    # silently rebinding the parameter while the walk was in progress.
    for root, subdirs, _files in os.walk(dir):
        for subdir in subdirs:
            result_file = get_latest_result_file(os.path.join(root, subdir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results
def load_results(results_dir: str = "/tmp/hhem_results"):
    """Download the results dataset and load it into a sorted DataFrame.

    The dataset snapshot is written to ``results_dir`` on disk by
    ``pull_results``, then scanned with ``scan_and_extract``.

    Args:
        results_dir: Local directory the snapshot is downloaded into and
            scanned from.

    Returns:
        A DataFrame with one row per extracted result, sorted by
        ``"Hallucination %"`` ascending, with the three metric columns rounded
        to 3 decimals and an extra ``"LLM_lower_case"`` column.

    Raises:
        ValueError: If no result could be extracted from ``results_dir``.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")

    results = scan_and_extract(results_dir)
    if not results:
        raise ValueError(f"No results found in {results_dir}")
    print(f"Successfully extracted {len(results)} results")

    results_df = pd.DataFrame(results)
    # Substitute the "TBD" placeholder *before* sorting: sorting a column that
    # mixes the string "TBD" with floats raises TypeError, and the ordering
    # should reflect the substituted value (100) anyway.
    # NOTE(review): the replacement is frame-wide, as in the original code, so
    # a literal "TBD" in a non-metric column is replaced too - confirm intent.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
    return results_df
# Mapping from dropdown display values to internal (slice_type, slice_key) pairs.
# slice_type is "overall", "complexity" or "category"; apply_data_slice() uses it
# to pick the per-model "text_complexity_results" or "category_results" dict and
# then looks up slice_key inside it. "Overall" has no key (None).
DATA_SLICE_MAP = {
    "Overall": ("overall", None),
    "Low Complexity": ("complexity", "low_complexity_text"),
    "High Complexity": ("complexity", "high_complexity_text"),
    "Business": ("category", "business"),
    "Education": ("category", "education"),
    "Finance": ("category", "finance"),
    "Law": ("category", "law"),
    "Medicine": ("category", "medicine"),
    "Politics": ("category", "politics"),
    "Science": ("category", "science"),
    "Sports": ("category", "sports"),
    "Stocks": ("category", "stocks"),
    "Technology": ("category", "technology"),
}
def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
    """Recompute the leaderboard metrics for a single data slice.

    Args:
        df: DataFrame carrying ``category_results`` and
            ``text_complexity_results`` columns (one dict per row).
        slice_name: Display name of the slice, e.g. ``"Overall"``,
            ``"Low Complexity"`` or ``"Business"``.

    Returns:
        ``df`` itself, unchanged, when ``slice_name`` is unknown or maps to the
        overall slice. Otherwise a copy restricted to the rows that have data
        for the slice, with ``"Hallucination %"``, ``"Answer %"`` and
        ``"Avg Summary Words"`` overwritten from the slice data (rounded to 3
        decimals, 0 for a missing metric) and sorted by ``"Hallucination %"``
        ascending.
    """
    if slice_name not in DATA_SLICE_MAP:
        return df
    slice_type, slice_key = DATA_SLICE_MAP[slice_name]
    if slice_type == "overall":
        return df

    # Anything that is not a complexity slice is a category slice.
    if slice_type == "complexity":
        source_column = "text_complexity_results"
    else:
        source_column = "category_results"

    # (output column, key inside the per-slice dict), applied in this order.
    metric_fields = (
        ("Hallucination %", "hallucination_rate"),
        ("Answer %", "answer_rate"),
        ("Avg Summary Words", "average_summary_length"),
    )

    sliced = df.copy()
    kept_labels = []
    for label, record in sliced.iterrows():
        per_slice = record.get(source_column, {})
        if not per_slice or slice_key not in per_slice:
            continue
        slice_metrics = per_slice[slice_key]
        if not slice_metrics:
            continue
        for column, metric_key in metric_fields:
            sliced.at[label, column] = round(slice_metrics.get(metric_key, 0), 3)
        kept_labels.append(label)

    # Drop the rows without data for this slice, then re-rank.
    sliced = sliced.loc[kept_labels]
    return sliced.sort_values(by="Hallucination %", ascending=True)
# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick a label font size from the model name length and hallucination value.

    Args:
        LLM: Model name; only its length is used.
        hallucination_percent: Hallucination value compared against 0.25.

    Returns:
        ``8.5`` when ``hallucination_percent`` is below 0.25 and the name is
        longer than 10 characters, otherwise ``9``. The return annotation is
        ``float`` because ``8.5`` is not an ``int``.
    """
    # Only a long name combined with a low value needs the smaller font.
    if hallucination_percent < 0.25 and len(LLM) > 10:
        return 8.5
    return 9
def determine_font_color(hallucination_percent: float) -> str:
    """Return the label colour for a given hallucination value.

    ``'black'`` is returned when the value lies strictly between 0.25 and
    0.65; every other value (including both bounds) gives ``'white'``.
    """
    in_middle_band = 0.25 < hallucination_percent < 0.65
    return 'black' if in_middle_band else 'white'
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> "tuple[float, str]":
    """Decide where a model label is placed and which colour it uses.

    The bar length is estimated as ``5 * hallucination_percent`` and compared
    with the number of characters in ``LLM``.

    Args:
        LLM: Model name; only its length is used (it is also printed).
        hallucination_percent: Hallucination value of the bar.

    Returns:
        A ``(x_position, font_color)`` tuple: ``(0.01, <colour from
        determine_font_color>)`` when the name is shorter than the estimated
        bar length, otherwise ``(hallucination_percent, 'black')``.
    """
    name_length = len(LLM)
    print("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if name_length < bar_length:
        # Label fits inside the bar: anchor it near the left edge.
        return 0.01, determine_font_color(hallucination_percent)
    # to the right of the bar, black anyway
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> go.Figure:
    """Create an interactive horizontal bar chart of the first 10 rows of ``df``.

    Bars are coloured with the reversed ``RdYlGn`` colormap according to the
    min-max normalised hallucination rate of the plotted rows, and hatched
    when the answer rate is below 95%.

    Args:
        df: DataFrame with ``"LLM"``, ``"Hallucination %"`` and ``"Answer %"``
            columns. Only ``df.head(10)`` is plotted, so the caller decides the
            ranking through the row order.

    Returns:
        The Plotly figure. For an empty ``df`` the figure contains no bars
        instead of raising.
    """
    plot_df = df.head(10).copy()

    if plot_df.empty:
        # A slice may have no models at all; the scaler and max() both fail on
        # empty input, so fall back to an empty chart with a unit x-range.
        plot_df["normalized_hallucination_rate"] = []
        x_upper = 1
    else:
        plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(
            plot_df[["Hallucination %"]]
        )
        highest_rate = plot_df["Hallucination %"].max()
        # Leave 15% headroom for the outside text; avoid a zero-width axis.
        x_upper = highest_rate * 1.15 if highest_rate > 0 else 1

    plot_df = plot_df.iloc[::-1]  # Reverse for bottom-to-top display

    # Strip org prefix for labels
    labels = [name.split("/")[-1] for name in plot_df["LLM"]]
    hallucination_rates = plot_df["Hallucination %"].tolist()
    answer_rates = plot_df["Answer %"].tolist()

    # Colors (RdYlGn_r) and patterns (hatched for low answer rate)
    colors = [
        to_hex(cm.RdYlGn_r(value))
        for value in plot_df["normalized_hallucination_rate"]
    ]
    patterns = ["/" if answer_rate < 95 else "" for answer_rate in answer_rates]

    # Hover text with full details; Plotly renders "<br>" as a line break.
    hover_texts = [
        f"{label}<br>"
        f"Hallucination Rate: {rate}%<br>"
        f"Answer Rate: {answer_rate}%"
        + (" ✓" if answer_rate >= 95 else " (below 95%)")
        for label, rate, answer_rate in zip(labels, hallucination_rates, answer_rates)
    ]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=labels,
        x=plot_df["Hallucination %"],
        orientation='h',
        marker=dict(
            color=colors,
            pattern_shape=patterns,
            pattern_fillmode="overlay",
            line=dict(width=0)
        ),
        text=[f"{rate}%" for rate in hallucination_rates],
        textposition='outside',
        textfont=dict(size=10, color='black'),
        hovertemplate="%{customdata}",
        customdata=hover_texts
    ))

    # Title with copyright; read the clock once so year and date agree.
    now = datetime.now()
    title_text = (
        f"Grounded Hallucination Rate of Best LLMs · "
        f"© {now.year} Vectara · Created {now.strftime('%B %d, %Y')}"
    )

    fig.update_layout(
        title=dict(text=title_text, font=dict(size=13), x=0.5, xanchor='center'),
        xaxis=dict(title="Hallucination Rate", range=[0, x_upper]),
        yaxis=dict(title=""),
        showlegend=False,
        height=400,
        margin=dict(l=180, r=50, t=50, b=40),
        annotations=[
            dict(
                text="Striped = Answer Rate < 95%",
                xref="paper", yref="paper", x=1.0, y=0.98,
                showarrow=False, font=dict(size=10, color="gray"), xanchor="right", yanchor="top"
            )
        ]
    )
    return fig
# %%
if __name__ == "__main__":
    # Script entry point: download the results and print the resulting table.
    # `df` is bound at module level here.
    df = load_results()
    print(df)
# %%