Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

App Files Files Community

leaderboard / app /app_utils.py

ofermend

updated

0e2da72 about 11 hours ago

raw

history blame contribute delete

9.72 kB

	# %%
	import os
	import json
	from huggingface_hub import snapshot_download
	import pandas as pd
	import matplotlib.cm as cm
	from matplotlib.colors import to_hex
	import plotly.graph_objects as go
	from datetime import datetime
	from sklearn.preprocessing import MinMaxScaler

	# Shared module-level scaler instance; visualize_leaderboard() calls
	# fit_transform() on it each time a chart is built.
	min_max_scaler = MinMaxScaler()

	# %%
	def pull_results(results_dir: str):
	    """Download a snapshot of the ``vectara/results`` dataset repo.

	    Args:
	        results_dir: Local directory passed to ``snapshot_download`` as
	            ``local_dir``.
	    """
	    download_options = {
	        "repo_id": "vectara/results",
	        "repo_type": "dataset",
	        "local_dir": results_dir,
	    }
	    snapshot_download(**download_options)

	def extract_info_from_result_file(result_file):
	    """Load one result JSON file and flatten it into a leaderboard row.

	    Example of the expected file content::

	        {
	            "config": {
	                "model_dtype": "float16",
	                "model_name": "databricks/dbrx-instruct",
	                "model_sha": "main"
	            },
	            "results": {
	                "hallucination_rate": {
	                    "hallucination_rate": 8.34990059642147
	                },
	                "factual_consistency_rate": {
	                    "factual_consistency_rate": 91.65009940357854
	                },
	                "answer_rate": {
	                    "answer_rate": 100.0
	                },
	                "average_summary_length": {
	                    "average_summary_length": 85.9
	                }
	            }
	        }

	    The optional top-level keys ``model_annotations``, ``category_results``
	    and ``text_complexity_results`` are read when present.

	    Args:
	        result_file: Path of the JSON file to read.

	    Returns:
	        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``,
	        ``"Avg Summary Words"``, ``"Model Size"``, ``"Accessibility"``,
	        ``"category_results"`` and ``"text_complexity_results"``.

	    Raises:
	        KeyError: If ``config.model_name`` or one of the required entries
	            under ``results`` is missing.
	    """
	    # Use a context manager so the file handle is closed deterministically;
	    # JSON is read as UTF-8 regardless of the platform's default encoding.
	    with open(result_file, "r", encoding="utf-8") as handle:
	        info = json.load(handle)

	    # Model annotations are optional; fall back to "unknown" when absent.
	    annotations = info.get("model_annotations", {})
	    metrics = info["results"]

	    return {
	        # Trailing "-" characters are stripped from the model name.
	        "LLM": info["config"]["model_name"].rstrip("-"),
	        "Hallucination %": metrics["hallucination_rate"]["hallucination_rate"],
	        "Answer %": metrics["answer_rate"]["answer_rate"],
	        "Avg Summary Words": metrics["average_summary_length"]["average_summary_length"],
	        "Model Size": annotations.get("model_size", "unknown"),
	        "Accessibility": annotations.get("accessibility", "unknown"),
	        "category_results": info.get("category_results", {}),
	        "text_complexity_results": info.get("text_complexity_results", {}),
	    }

	def get_latest_result_file(dir: str):
	    """Return the path of the most recently modified ``.json`` file in ``dir``.

	    Files are ranked by modification time (``os.path.getmtime``). When
	    several files share the same modification time, the one whose name
	    sorts last is chosen, so the result does not depend on the arbitrary
	    order returned by ``os.listdir``.
	    NOTE(review): result file names presumably embed a timestamp, which is
	    why the name is used as the tie-breaker -- confirm against the dataset.

	    Args:
	        dir: Directory to inspect (not searched recursively).

	    Returns:
	        The joined path ``dir/<file name>``, or ``None`` when ``dir`` is not
	        a directory or contains no ``.json`` entries.
	    """
	    if not os.path.isdir(dir):
	        return None

	    json_names = [name for name in os.listdir(dir) if name.endswith(".json")]
	    if not json_names:
	        return None

	    latest_name = max(
	        json_names,
	        key=lambda name: (os.path.getmtime(os.path.join(dir, name)), name),
	    )
	    return os.path.join(dir, latest_name)

	def scan_and_extract(dir: str):
	    """Collect one extracted result per sub-directory found under ``dir``.

	    The tree is traversed with ``os.walk``. For every sub-directory
	    encountered, ``get_latest_result_file`` picks a JSON file (if any) and
	    ``extract_info_from_result_file`` turns it into a result dict. JSON files
	    sitting directly in ``dir`` itself are not considered.

	    Args:
	        dir: Root directory of the tree to scan.

	    Returns:
	        A list of result dicts, in traversal order.
	    """
	    extracted = []
	    for parent, subdir_names, _file_names in os.walk(dir):
	        for subdir_name in subdir_names:
	            latest_file = get_latest_result_file(os.path.join(parent, subdir_name))
	            if latest_file is None:
	                continue
	            extracted.append(extract_info_from_result_file(latest_file))
	    return extracted

	def load_results(results_dir: str = "/tmp/hhem_results"):
	    """Download the results dataset and build the leaderboard DataFrame.

	    The snapshot is downloaded to ``results_dir`` on disk via
	    ``pull_results``, scanned with ``scan_and_extract``, and the extracted
	    rows are assembled into a DataFrame.

	    Args:
	        results_dir: Local directory the dataset snapshot is downloaded to.

	    Returns:
	        A DataFrame sorted by ``"Hallucination %"`` ascending, with the
	        metric columns rounded to 3 decimals and an added
	        ``"LLM_lower_case"`` column.

	    Raises:
	        ValueError: If no results were found under ``results_dir``.
	    """
	    pull_results(results_dir)
	    print(f"Successfully pulled results from HuggingFace to {results_dir}")

	    results = scan_and_extract(results_dir)
	    if not results:
	        raise ValueError(f"No results found in {results_dir}")

	    print(f"Successfully extracted {len(results)} results")

	    results_df = pd.DataFrame(results)
	    # Substitute the "TBD" placeholder BEFORE sorting: sorting a column that
	    # still mixes the string "TBD" with numbers compares str against float.
	    results_df = results_df.replace("TBD", 100)
	    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

	    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
	        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

	    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()

	    return results_df


	# Mapping from dropdown display values to internal keys.
	# Each value is a (slice_type, slice_key) pair consumed by apply_data_slice():
	#   - slice_type "overall": the DataFrame is returned unchanged (slice_key is None)
	#   - slice_type "complexity": slice_key is looked up in "text_complexity_results"
	#   - slice_type "category": slice_key is looked up in "category_results"
	DATA_SLICE_MAP = {
	"Overall": ("overall", None),
	"Low Complexity": ("complexity", "low_complexity_text"),
	"High Complexity": ("complexity", "high_complexity_text"),
	"Business": ("category", "business"),
	"Education": ("category", "education"),
	"Finance": ("category", "finance"),
	"Law": ("category", "law"),
	"Medicine": ("category", "medicine"),
	"Politics": ("category", "politics"),
	"Science": ("category", "science"),
	"Sports": ("category", "sports"),
	"Stocks": ("category", "stocks"),
	"Technology": ("category", "technology"),
	}


	def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
	    """Recompute the leaderboard metrics for one data slice.

	    Args:
	        df: DataFrame with ``category_results`` and
	            ``text_complexity_results`` columns.
	        slice_name: Display name of the slice (a key of ``DATA_SLICE_MAP``,
	            e.g. "Overall", "Low Complexity", "Business").

	    Returns:
	        ``df`` itself when ``slice_name`` is unknown or maps to the overall
	        slice. Otherwise a copy restricted to the rows that have data for
	        the slice, with "Hallucination %", "Answer %" and
	        "Avg Summary Words" overwritten from the slice data (rounded to 3
	        decimals, a missing metric counted as 0) and sorted by
	        "Hallucination %" ascending.
	    """
	    slice_spec = DATA_SLICE_MAP.get(slice_name)
	    if slice_spec is None:
	        return df

	    slice_type, slice_key = slice_spec
	    if slice_type == "overall":
	        return df

	    # Anything that is not a complexity slice is treated as a category slice.
	    if slice_type == "complexity":
	        source_column = "text_complexity_results"
	    else:
	        source_column = "category_results"

	    # (target column, key inside the slice data), applied in this order.
	    metric_fields = (
	        ("Hallucination %", "hallucination_rate"),
	        ("Answer %", "answer_rate"),
	        ("Avg Summary Words", "average_summary_length"),
	    )

	    sliced_df = df.copy()
	    kept_indices = []

	    for row_index, row in sliced_df.iterrows():
	        per_slice = row.get(source_column, {})
	        if not per_slice or slice_key not in per_slice:
	            continue

	        slice_metrics = per_slice[slice_key]
	        if not slice_metrics:
	            continue

	        for target_column, metric_key in metric_fields:
	            sliced_df.at[row_index, target_column] = round(
	                slice_metrics.get(metric_key, 0), 3
	            )
	        kept_indices.append(row_index)

	    # Keep only rows that had data for this slice, then re-rank them.
	    sliced_df = sliced_df.loc[kept_indices]
	    return sliced_df.sort_values(by="Hallucination %", ascending=True)

	# %%
	def determine_font_size(LLM: str, hallucination_percent: float) -> float:
	    """Pick a label font size from the model name and hallucination value.

	    Args:
	        LLM: Model name; only its length is used.
	        hallucination_percent: Hallucination value compared against 0.25.

	    Returns:
	        8.5 when ``hallucination_percent`` is below 0.25 and the name is
	        longer than 10 characters, otherwise 9. The result may be a
	        non-integer, hence the ``float`` return annotation.
	    """
	    name_length = len(LLM)
	    # Only a long name combined with a low value gets the smaller size.
	    if hallucination_percent < 0.25 and name_length > 10:
	        return 8.5
	    return 9

	def determine_font_color(hallucination_percent: float) -> str:
	    """Return ``'black'`` for values strictly between 0.25 and 0.65, else ``'white'``."""
	    in_middle_band = 0.25 < hallucination_percent < 0.65
	    return 'black' if in_middle_band else 'white'

	def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> tuple:
	    """Decide where to place a model label relative to its bar, and its color.

	    The bar length is estimated as ``5 * hallucination_percent`` and compared
	    with the number of characters in ``LLM``.

	    Args:
	        LLM: Model name used as the label text.
	        hallucination_percent: Hallucination value of the model.

	    Returns:
	        A ``(x_position, font_color)`` tuple: ``(0.01, <color from
	        determine_font_color>)`` when the name is shorter than the estimated
	        bar length, otherwise ``(hallucination_percent, 'black')``.
	    """
	    name_length = len(LLM)
	    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

	    hallu_rate_to_bar_length_ratio = 5
	    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

	    if name_length < bar_length:
	        # Label starts near the left edge of the bar (x = 0.01).
	        return 0.01, determine_font_color(hallucination_percent)

	    # Label goes to the right of the bar, always in black.
	    return hallucination_percent, 'black'

	def visualize_leaderboard(df: pd.DataFrame) -> go.Figure:
	    """Build a horizontal bar chart of the first 10 rows of ``df``.

	    Bars show "Hallucination %", are colored with the ``RdYlGn_r`` colormap
	    from the min-max-normalized rate, and are hatched when "Answer %" is
	    below 95.

	    Args:
	        df: DataFrame with "LLM", "Hallucination %" and "Answer %" columns.

	    Returns:
	        The assembled plotly figure.
	    """
	    top_rows = df.head(10).copy()
	    top_rows["normalized_hallucination_rate"] = min_max_scaler.fit_transform(
	        top_rows[["Hallucination %"]]
	    )
	    # Reverse so the first row of ``df`` ends up at the top of the chart.
	    top_rows = top_rows.iloc[::-1]

	    # Drop the organisation prefix ("org/model" -> "model") for axis labels.
	    labels = [full_name.split("/")[-1] for full_name in top_rows["LLM"]]

	    # One pass over the rows: bar color, hatch pattern and hover text.
	    colors = []
	    patterns = []
	    hover_texts = []
	    for label, (_, row) in zip(labels, top_rows.iterrows()):
	        answer_rate = row["Answer %"]
	        colors.append(to_hex(cm.RdYlGn_r(row["normalized_hallucination_rate"])))
	        patterns.append("/" if answer_rate < 95 else "")
	        answer_note = " ✓" if answer_rate >= 95 else " (below 95%)"
	        hover_texts.append(
	            f"<b>{label}</b><br>"
	            f"Hallucination Rate: {row['Hallucination %']}%<br>"
	            f"Answer Rate: {answer_rate}%"
	            + answer_note
	        )

	    hallucination_values = top_rows["Hallucination %"]
	    bar_marker = dict(
	        color=colors,
	        pattern_shape=patterns,
	        pattern_fillmode="overlay",
	        line=dict(width=0),
	    )
	    bar_trace = go.Bar(
	        y=labels,
	        x=hallucination_values,
	        orientation='h',
	        marker=bar_marker,
	        text=[f"{val}%" for val in hallucination_values],
	        textposition='outside',
	        textfont=dict(size=10, color='black'),
	        hovertemplate="%{customdata}<extra></extra>",
	        customdata=hover_texts,
	    )

	    fig = go.Figure()
	    fig.add_trace(bar_trace)

	    # Title with copyright year and creation date.
	    title_text = (
	        "Grounded Hallucination Rate of Best LLMs · "
	        f"© {datetime.now().year} Vectara · Created {datetime.now().strftime('%B %d, %Y')}"
	    )

	    striped_note = dict(
	        text="Striped = Answer Rate < 95%",
	        xref="paper", yref="paper", x=1.0, y=0.98,
	        showarrow=False, font=dict(size=10, color="gray"),
	        xanchor="right", yanchor="top",
	    )
	    # Leave 15% headroom on the x axis beyond the longest bar.
	    x_axis_upper = max(top_rows["Hallucination %"]) * 1.15

	    fig.update_layout(
	        title=dict(text=title_text, font=dict(size=13), x=0.5, xanchor='center'),
	        xaxis=dict(title="Hallucination Rate", range=[0, x_axis_upper]),
	        yaxis=dict(title=""),
	        showlegend=False,
	        height=400,
	        margin=dict(l=180, r=50, t=50, b=40),
	        annotations=[striped_note],
	    )
	    return fig

	# %%

	# Script entry point: load the results DataFrame and print it.
	if __name__ == "__main__":
	df = load_results()
	print(df)

	# %%