leaderboard / app / app_utils.py
ofermend's picture
updated to gradio; python 3.11; visual improvements
d0c57df
raw
history blame
6.71 kB
# %%
import os
import json
from huggingface_hub import snapshot_download
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import matplotlib.patheffects as pe
# Module-level scaler used to map values onto [0, 1]; NOTE(review): it is
# shared state and re-fit on every use via fit_transform.
min_max_scaler = MinMaxScaler()
# %%
def pull_results(results_dir: str):
    """Mirror the ``vectara/results`` HuggingFace dataset into *results_dir*."""
    download_args = dict(
        repo_id="vectara/results",
        repo_type="dataset",
        local_dir=results_dir,
    )
    snapshot_download(**download_args)
def extract_info_from_result_file(result_file):
    """Parse a single leaderboard result JSON file into a flat summary dict.

    Expected file layout:

    {
      "config": {
        "model_dtype": "float16",
        "model_name": "databricks/dbrx-instruct",
        "model_sha": "main"
      },
      "results": {
        "hallucination_rate": {"hallucination_rate": 8.34990059642147},
        "factual_consistency_rate": {"factual_consistency_rate": 91.65009940357854},
        "answer_rate": {"answer_rate": 100.0},
        "average_summary_length": {"average_summary_length": 85.9}
      }
    }

    An optional top-level "model_annotations" section carries
    "model_size" and "accessibility"; both default to "unknown".

    Returns a dict with keys: "LLM", "Hallucination %", "Answer %",
    "Avg Summary Words", "Model Size", "Accessibility".
    Raises KeyError if the required config/results fields are missing.
    """
    # Context manager closes the handle promptly (the original relied on
    # the garbage collector to close the file opened inline).
    with open(result_file, "r", encoding="utf-8") as fp:
        info = json.load(fp)
    # model_annotations is optional; fall back to "unknown" for missing data.
    annotations = info.get("model_annotations", {})
    model_size = annotations.get("model_size", "unknown")
    accessibility = annotations.get("accessibility", "unknown")
    result = {
        # rstrip drops any trailing dashes left over in the model name.
        "LLM": info["config"]["model_name"].rstrip("-"),
        "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
        "Answer %": info["results"]["answer_rate"]["answer_rate"],
        "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
        "Model Size": model_size,
        "Accessibility": accessibility,
    }
    return result
def get_latest_result_file(dir: str):
    """Return the most recently modified ``.json`` file in *dir*.

    "Latest" means filesystem mtime (not any timestamp embedded in the
    file name). Returns ``None`` when *dir* does not exist or holds no
    JSON files.
    """
    if not os.path.isdir(dir):
        return None
    json_files = [name for name in os.listdir(dir) if name.endswith(".json")]
    if not json_files:
        return None
    # Stable sort by mtime; the last entry is the newest.
    by_mtime = sorted(
        json_files, key=lambda name: os.path.getmtime(os.path.join(dir, name))
    )
    return os.path.join(dir, by_mtime[-1])
def scan_and_extract(dir: str):
    """Recursively scan *dir* and parse the newest result JSON in each subfolder.

    For every subdirectory encountered during the walk, the most recent
    ``.json`` file (by mtime, via ``get_latest_result_file``) is parsed
    with ``extract_info_from_result_file``; subdirectories without JSON
    files are skipped.

    Returns a list of per-model result dicts.
    """
    results = []
    for root, subdirs, _files in os.walk(dir):
        # NOTE: the original loop variable was named `dir`, shadowing both
        # the builtin and this function's own parameter; renamed. The
        # original `if len(dirs) == 0: continue` guard was a no-op (an
        # empty inner loop simply does nothing) and has been dropped.
        for subdir in subdirs:
            result_file = get_latest_result_file(os.path.join(root, subdir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results
def load_results(results_dir: str = "/tmp/hhem_results"):
    """Download and assemble the leaderboard DataFrame.

    Pulls the results dataset snapshot, parses every model's newest
    result file, and returns a DataFrame sorted by hallucination rate
    (ascending) with numeric columns rounded to 3 decimals and an extra
    lowercase "LLM_lower_case" helper column.

    Raises ValueError when no result files are found.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")
    records = scan_and_extract(results_dir)
    if not records:
        raise ValueError(f"No results found in {results_dir}")
    print(f"Successfully extracted {len(records)} results")
    df = pd.DataFrame(records)
    # Placeholder "TBD" entries are treated as the worst possible rate.
    df = df.sort_values(by="Hallucination %", ascending=True).replace("TBD", 100)
    for col in ("Hallucination %", "Answer %", "Avg Summary Words"):
        df[col] = df[col].apply(lambda value: round(value, 3))
    df["LLM_lower_case"] = df["LLM"].str.lower()
    return df
# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick a label font size based on the LLM name and hallucination rate.

    A long name (> 10 chars) on a short bar (rate < 0.25) gets a slightly
    smaller size (8.5) so it fits; everything else uses the default of 9.

    Note: the original annotated the return as ``int`` although it
    returns ``8.5`` on one path — annotation corrected to ``float``.
    """
    if hallucination_percent < 0.25 and len(LLM) > 10:
        return 8.5
    return 9
def determine_font_color(hallucination_percent: float) -> str:
    """Pick a readable label color: black on mid-range bars
    (rate strictly between 0.25 and 0.65), white otherwise."""
    is_mid_range = 0.25 < hallucination_percent < 0.65
    return 'black' if is_mid_range else 'white'
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> tuple[float, str]:
    """Decide where to draw an LLM label and in what color.

    If the name fits inside the bar (name length < scaled bar length),
    the label goes just inside the bar's left edge (x=0.01) with a
    contrast-appropriate color from ``determine_font_color``; otherwise
    it is placed at the bar's end (x=hallucination_percent) in black.

    Returns ``(x_position, color)``. Note: the original annotated the
    return as ``float`` although a tuple is returned — corrected. A
    leftover debug ``print`` was also removed.
    """
    name_length = len(LLM)
    # Empirical factor converting a hallucination rate into an approximate
    # bar length in character units — TODO confirm against the plot layout.
    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
    if name_length < bar_length:
        return 0.01, determine_font_color(hallucination_percent)
    # Name longer than the bar: placed to the right of the bar, black anyway.
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Render the top-10 leaderboard rows as a horizontal bar chart.

    Bars are colored by the min-max-normalized hallucination rate
    (reversed RdYlGn colormap), labeled with the rate, and capped with a
    dot whose color encodes whether the model's answer rate is >= 95%.

    Expects columns "LLM", "Hallucination %", and "Answer %"; returns the
    matplotlib Figure.
    """
    fig = plt.figure(figsize=(10, 5))
    plot_df = df.head(10).copy()
    # Normalize locally instead of re-fitting the shared module-level
    # MinMaxScaler (mutating shared state on every call). Guard the
    # zero-range case the same way sklearn does: all zeros.
    rates = plot_df["Hallucination %"]
    rate_span = rates.max() - rates.min()
    plot_df["normalized_hallucination_rate"] = (
        (rates - rates.min()) / rate_span if rate_span else 0.0
    )
    # Reverse order so the lowest hallucination rate ends up at the top.
    plot_df = plot_df.iloc[::-1]
    y_positions = range(len(plot_df))
    plt.barh(y_positions, plot_df["Hallucination %"],
             color=plt.cm.RdYlGn_r(plot_df["normalized_hallucination_rate"]))
    # Value labels to the right of bars and answer-rate dots at the bar end.
    # Access columns by name: the original's itertuples positional fields
    # (row._2 / row._3) silently break if the column order ever changes.
    for i, (hallu, answer) in enumerate(
        zip(plot_df["Hallucination %"], plot_df["Answer %"])
    ):
        plt.text(hallu + 0.2, i, f"{hallu}%", ha='left', va='center',
                 fontsize=8, fontweight='bold')
        # Answer rate indicator - colored dot at end of bar.
        ar_dot_color = '#22aa22' if answer >= 95 else '#cc3333'
        plt.scatter(hallu, i, color=ar_dot_color, s=25, zorder=5)
    # Strip org prefix (e.g., "google/gemini-2.5" -> "gemini-2.5").
    labels = [name.split("/")[-1] for name in plot_df["LLM"]]
    plt.yticks(y_positions, labels, fontsize=8)
    plt.xlabel("Hallucination Rate", fontsize=10)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    # Empty scatters exist only to populate the answer-rate legend.
    plt.scatter([], [], color='#22aa22', s=25, label='≥95%')
    plt.scatter([], [], color='#cc3333', s=25, label='<95%')
    plt.legend(loc='upper right', fontsize=8, framealpha=0.9,
               title='Answer Rate', title_fontsize=8)
    plt.tight_layout()
    plt.subplots_adjust(left=0.25, bottom=0.15)
    # Add copyright at bottom
    plt.figtext(0.5, 0.02, f"Copyright (2025) Vectara, Inc. - Plot generated on {datetime.now().strftime('%B %d, %Y')}",
                ha='center', fontsize=10)
    return fig
# %%
# Manual smoke test: pull the latest results and print the assembled table.
if __name__ == "__main__":
    df = load_results()
    print(df)
# %%