Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 6,711 Bytes
8c3427d d0c57df 8c3427d d0c57df 8c3427d 696341e 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d 81cb431 120684a 8c3427d 696341e d0c57df 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d 696341e 8c3427d d0c57df 8c3427d d0c57df 696341e d0c57df 8c3427d d0c57df 696341e d0c57df 696341e d0c57df 8c3427d d0c57df 8c3427d d0c57df 8c3427d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# %%
import os
import json
from huggingface_hub import snapshot_download
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import matplotlib.patheffects as pe
# Module-level scaler used by visualize_leaderboard, which re-fits it on every
# call through fit_transform.
min_max_scaler = MinMaxScaler()
# %%
def pull_results(results_dir: str):
    """Download the ``vectara/results`` dataset repository into ``results_dir``.

    Args:
        results_dir: Local directory handed to ``snapshot_download`` as
            ``local_dir``.

    Returns:
        None. The value returned by ``snapshot_download`` is discarded.
    """
    snapshot_download(
        local_dir=results_dir,
        repo_type="dataset",
        repo_id="vectara/results",
    )
def extract_info_from_result_file(result_file):
    """Read one result JSON file and flatten it into a leaderboard row.

    Expected file layout (``model_annotations`` is optional)::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {"hallucination_rate": 8.34990059642147},
                "factual_consistency_rate": {"factual_consistency_rate": 91.65009940357854},
                "answer_rate": {"answer_rate": 100.0},
                "average_summary_length": {"average_summary_length": 85.9}
            },
            "model_annotations": {"model_size": "...", "accessibility": "..."}
        }

    Args:
        result_file: Path of the JSON file to read.

    Returns:
        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``,
        ``"Avg Summary Words"``, ``"Model Size"`` and ``"Accessibility"``.
        The last two fall back to ``"unknown"`` when the annotation is absent.

    Raises:
        KeyError: If a required ``config`` / ``results`` entry is missing.
        OSError, json.JSONDecodeError: If the file cannot be read or parsed.
    """
    # Use a context manager so the file handle is closed deterministically,
    # and decode as UTF-8 rather than the platform default encoding.
    with open(result_file, "r", encoding="utf-8") as handle:
        info = json.load(handle)

    # Extract model_annotations with defaults for missing data; `or {}` also
    # covers an explicit JSON null.
    annotations = info.get("model_annotations") or {}
    model_size = annotations.get("model_size", "unknown")
    accessibility = annotations.get("accessibility", "unknown")

    results = info["results"]
    return {
        # Trailing hyphens are stripped from the model name.
        "LLM": info["config"]["model_name"].rstrip("-"),
        "Hallucination %": results["hallucination_rate"]["hallucination_rate"],
        "Answer %": results["answer_rate"]["answer_rate"],
        "Avg Summary Words": results["average_summary_length"]["average_summary_length"],
        "Model Size": model_size,
        "Accessibility": accessibility,
    }
def get_latest_result_file(dir: str):
    """Return the most recently modified ``.json`` file directly inside ``dir``.

    Recency is judged by file modification time (``os.path.getmtime``), not by
    anything parsed from the file name. When several files share the same
    modification time the lexicographically greatest file name wins, so the
    choice does not depend on the arbitrary order of ``os.listdir``.

    Args:
        dir: Directory to inspect (not searched recursively).

    Returns:
        The joined path ``os.path.join(dir, name)`` of the selected file, or
        ``None`` when ``dir`` is not a directory or holds no ``.json`` entry.
    """
    if not os.path.isdir(dir):
        return None

    candidates = [name for name in os.listdir(dir) if name.endswith(".json")]
    if not candidates:
        return None

    # A single max() pass replaces the full sort; the name is a tie-breaker.
    latest = max(
        candidates,
        key=lambda name: (os.path.getmtime(os.path.join(dir, name)), name),
    )
    return os.path.join(dir, latest)
def scan_and_extract(dir: str):
    """Collect one leaderboard row per sub-directory found under ``dir``.

    Walks ``dir`` recursively. For every sub-directory encountered, the file
    chosen by ``get_latest_result_file`` (if any) is parsed with
    ``extract_info_from_result_file``. Other JSON files in that sub-directory
    are ignored, and JSON files located directly in ``dir`` itself are never
    examined because only sub-directories are passed on.

    Args:
        dir: Root directory to walk.

    Returns:
        A list of row dicts, one per sub-directory that yielded a result file.
    """
    results = []
    # `subdir` deliberately does not reuse the name `dir`, so the parameter is
    # not rebound while the walk is in progress.
    for root, subdirs, _files in os.walk(dir):
        for subdir in subdirs:
            result_file = get_latest_result_file(os.path.join(root, subdir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results
def load_results(results_dir: str = "/tmp/hhem_results"):
    """Download the results dataset and return it as a sorted DataFrame.

    The dataset snapshot is first written to ``results_dir`` on disk by
    ``pull_results``; the rows are then gathered by ``scan_and_extract``.

    Args:
        results_dir: Local directory used for the downloaded snapshot.

    Returns:
        A ``pandas.DataFrame`` sorted ascending by ``"Hallucination %"``, with
        the three numeric columns rounded to 3 decimals and an extra
        ``"LLM_lower_case"`` column holding the lower-cased model name.

    Raises:
        ValueError: If no result could be extracted from ``results_dir``.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")

    results = scan_and_extract(results_dir)
    if not results:
        raise ValueError(f"No results found in {results_dir}")
    print(f"Successfully extracted {len(results)} results")

    results_df = pd.DataFrame(results)
    # Substitute the "TBD" placeholder BEFORE sorting: a string left in the
    # sort column would make sort_values compare str with float and raise.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
    return results_df
# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick a label font size from the model name length and hallucination value.

    Args:
        LLM: Model name; only its length is used.
        hallucination_percent: Value compared against the 0.25 threshold.

    Returns:
        ``8.5`` when ``hallucination_percent`` is below 0.25 and the name is
        longer than 10 characters, otherwise ``9``. The result may be
        fractional, hence the ``float`` return annotation.
    """
    # Only a long name combined with a low value gets the smaller font.
    if hallucination_percent < 0.25 and len(LLM) > 10:
        return 8.5
    return 9
def determine_font_color(hallucination_percent: float) -> str:
    """Return the label colour for a given hallucination value.

    ``'black'`` is returned when the value lies strictly between 0.25 and
    0.65; every other value (including both bounds) yields ``'white'``.
    """
    in_middle_band = 0.25 < hallucination_percent < 0.65
    return 'black' if in_middle_band else 'white'
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> "tuple[float, str]":
    """Decide where a model-name label goes relative to its bar, and its colour.

    The bar length is estimated as ``5 * hallucination_percent`` and compared
    with the number of characters in ``LLM``.

    Args:
        LLM: Model name; only its length is used for the placement decision.
        hallucination_percent: Value the bar length is derived from.

    Returns:
        A ``(x_position, font_color)`` tuple: ``(0.01, <colour from
        determine_font_color>)`` when the name is shorter than the estimated
        bar length, otherwise ``(hallucination_percent, 'black')``.
        The annotation is quoted so it is not evaluated at definition time.
    """
    name_length = len(LLM)
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)
    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
    if name_length < bar_length:
        return 0.01, determine_font_color(hallucination_percent)
    # to the right of the bar, black anyway
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Draw a horizontal bar chart for the first 10 rows of ``df``.

    Args:
        df: DataFrame with at least the columns ``"LLM"``,
            ``"Hallucination %"`` and ``"Answer %"``. Only ``df.head(10)`` is
            plotted, so the rows should already be in the desired order
            (``load_results`` returns them sorted ascending by
            ``"Hallucination %"``).

    Returns:
        The matplotlib figure containing the chart.
    """
    fig = plt.figure(figsize=(10, 5))
    plot_df = df.head(10).copy()
    # Scale the plotted values to [0, 1] so they can index the colormap.
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
    # Reverse order so lowest hallucination is at top
    plot_df = plot_df.iloc[::-1]
    y_positions = range(len(plot_df))
    plt.barh(y_positions, plot_df["Hallucination %"], color=plt.cm.RdYlGn_r(plot_df["normalized_hallucination_rate"]))
    # Add value labels to the right of bars and answer rate dots at bar end.
    # Columns are read by name, so the loop does not depend on column order
    # the way the positional itertuples fields (`_2`, `_3`) would.
    rates = zip(plot_df["Hallucination %"], plot_df["Answer %"])
    for i, (hallucination_rate, answer_rate) in enumerate(rates):
        plt.text(hallucination_rate + 0.2, i, f"{hallucination_rate}%", ha='left', va='center', fontsize=8, fontweight='bold')
        # Answer rate indicator - colored dot at end of bar
        ar_dot_color = '#22aa22' if answer_rate >= 95 else '#cc3333'
        plt.scatter(hallucination_rate, i, color=ar_dot_color, s=25, zorder=5)
    # Strip org prefix (e.g., "google/gemini-2.5" -> "gemini-2.5")
    labels = [name.split("/")[-1] for name in plot_df["LLM"]]
    plt.yticks(y_positions, labels, fontsize=8)
    plt.xlabel("Hallucination Rate", fontsize=10)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    # Add legend for answer rate dots (empty scatters only create the entries)
    plt.scatter([], [], color='#22aa22', s=25, label='≥95%')
    plt.scatter([], [], color='#cc3333', s=25, label='<95%')
    plt.legend(loc='upper right', fontsize=8, framealpha=0.9, title='Answer Rate', title_fontsize=8)
    plt.tight_layout()
    # Applied after tight_layout so these margins take precedence.
    plt.subplots_adjust(left=0.25, bottom=0.15)
    # Add copyright at bottom
    plt.figtext(0.5, 0.02, f"Copyright (2025) Vectara, Inc. - Plot generated on {datetime.now().strftime('%B %d, %Y')}",
                ha='center', fontsize=10)
    return fig
# %%
if __name__ == "__main__":
    # Executed only when the file is run as a script: load the results table
    # and print it.
    df = load_results()
    print(df)
# %%
|