Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| # %% | |
| import os | |
| import json | |
| from huggingface_hub import snapshot_download | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import matplotlib.figure | |
| from datetime import datetime | |
| from sklearn.preprocessing import MinMaxScaler | |
| import matplotlib.patheffects as pe | |
# Module-level scaler shared across calls; visualize_leaderboard re-fits it on
# every call via fit_transform to normalize hallucination rates for coloring.
min_max_scaler = MinMaxScaler()
| # %% | |
def pull_results(results_dir: str):
    """Download a snapshot of the ``vectara/results`` dataset repository.

    Args:
        results_dir: Local directory passed to ``snapshot_download`` as
            ``local_dir``; the snapshot files are placed there.
    """
    download_options = {
        "repo_id": "vectara/results",
        "repo_type": "dataset",
        "local_dir": results_dir,
    }
    snapshot_download(**download_options)
def extract_info_from_result_file(result_file):
    """Read one result JSON file and flatten it into a leaderboard row.

    Expected file layout (``model_annotations`` is optional)::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {
                    "hallucination_rate": 8.34990059642147
                },
                "factual_consistency_rate": {
                    "factual_consistency_rate": 91.65009940357854
                },
                "answer_rate": {
                    "answer_rate": 100.0
                },
                "average_summary_length": {
                    "average_summary_length": 85.9
                }
            },
            "model_annotations": {
                "model_size": "...",
                "accessibility": "..."
            }
        }

    Args:
        result_file: Path of the JSON file to read.

    Returns:
        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``,
        ``"Avg Summary Words"``, ``"Model Size"`` and ``"Accessibility"``.
        The last two default to ``"unknown"`` when the annotation is absent.

    Raises:
        KeyError: If a required ``config`` / ``results`` entry is missing.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(result_file, "r", encoding="utf-8") as fh:
        info = json.load(fh)

    # `or {}` also covers an explicit JSON null for "model_annotations".
    annotations = info.get("model_annotations") or {}
    results = info["results"]

    return {
        # Trailing dashes are stripped from the model name.
        "LLM": info["config"]["model_name"].rstrip("-"),
        "Hallucination %": results["hallucination_rate"]["hallucination_rate"],
        "Answer %": results["answer_rate"]["answer_rate"],
        "Avg Summary Words": results["average_summary_length"]["average_summary_length"],
        "Model Size": annotations.get("model_size", "unknown"),
        "Accessibility": annotations.get("accessibility", "unknown"),
    }
def get_latest_result_file(dir: str):
    """Return the path of the most recently modified ``.json`` file in ``dir``.

    Recency is taken from each file's modification time. Files with equal
    modification times are ordered by file name, so the result does not depend
    on the arbitrary order in which ``os.listdir`` reports entries.

    Args:
        dir: Directory to inspect (sub-directories are not searched).

    Returns:
        The joined path ``dir/<file>`` of the newest ``.json`` file, or
        ``None`` when ``dir`` is not a directory or holds no ``.json`` file.
    """
    if not os.path.isdir(dir):
        return None

    json_files = [name for name in os.listdir(dir) if name.endswith(".json")]
    if not json_files:
        return None

    # (mtime, name) keys are unique per file, which makes the pick deterministic.
    latest = max(
        json_files,
        key=lambda name: (os.path.getmtime(os.path.join(dir, name)), name),
    )
    return os.path.join(dir, latest)
def scan_and_extract(dir: str):
    """Collect one leaderboard row per result directory found below ``dir``.

    Walks ``dir`` recursively. For every sub-directory (at any depth) the
    newest JSON file, as chosen by ``get_latest_result_file``, is parsed with
    ``extract_info_from_result_file``. JSON files placed directly in ``dir``
    itself are not read, and sub-directories without JSON files are skipped.

    Args:
        dir: Root directory to walk.

    Returns:
        A list of the row dicts produced by ``extract_info_from_result_file``.
    """
    results = []
    # A distinct loop name avoids rebinding the `dir` parameter mid-iteration.
    for root, subdir_names, _files in os.walk(dir):
        for subdir_name in subdir_names:
            result_file = get_latest_result_file(os.path.join(root, subdir_name))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results
def load_results(results_dir: str = "/tmp/hhem_results"):
    """Download the results dataset and return it as a sorted DataFrame.

    The dataset snapshot is written to ``results_dir`` on disk, then every
    result directory below it is parsed into one row.

    Args:
        results_dir: Local directory the snapshot is downloaded into.

    Returns:
        A DataFrame sorted by ascending ``"Hallucination %"``, with the numeric
        columns rounded to three decimals and an extra ``"LLM_lower_case"``
        column holding the lower-cased model name.

    Raises:
        ValueError: If no result could be extracted from ``results_dir``.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")

    results = scan_and_extract(results_dir)
    if not results:
        raise ValueError(f"No results found in {results_dir}")
    print(f"Successfully extracted {len(results)} results")

    results_df = pd.DataFrame(results)
    # Replace "TBD" placeholders BEFORE sorting: a string left in the sort
    # column would make sort_values fail when comparing str with float.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
    return results_df
| # %% | |
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick the label font size for a model name.

    A slightly smaller font (8.5) is used only when the hallucination value is
    below 0.25 AND the name is longer than 10 characters; every other
    combination gets size 9.

    Args:
        LLM: Model name whose length is measured.
        hallucination_percent: Hallucination value compared against 0.25.
            NOTE(review): the 0.25 threshold suggests a 0-1 scale rather than
            a 0-100 percentage; confirm against the caller.

    Returns:
        The font size in points (8.5 or 9).
    """
    is_short_bar = hallucination_percent < 0.25
    has_long_name = len(LLM) > 10
    return 8.5 if is_short_bar and has_long_name else 9
def determine_font_color(hallucination_percent: float) -> str:
    """Choose the label color for a given hallucination value.

    Values strictly between 0.25 and 0.65 get ``'black'``; everything else,
    including both boundary values, gets ``'white'``.
    """
    in_middle_band = 0.25 < hallucination_percent < 0.65
    if not in_middle_band:
        return 'white'
    return 'black'
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> tuple[float, str]:
    """Decide where a model-name label goes and which color it uses.

    The name length (in characters) is compared with an estimated bar length
    of ``5 * hallucination_percent``. A name shorter than the bar is placed
    inside it at x = 0.01, colored by ``determine_font_color``. Otherwise the
    label starts at the bar's end and is always black.

    Args:
        LLM: Model name to place.
        hallucination_percent: Hallucination value that sets the bar length.

    Returns:
        A ``(x_position, font_color)`` tuple.
    """
    name_length = len(LLM)
    print("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if name_length < bar_length:
        # Label fits inside the bar.
        return 0.01, determine_font_color(hallucination_percent)
    # Label sits to the right of the bar, on the plain background.
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Draw a horizontal bar chart of the first ten rows of ``df``.

    Each bar shows a model's ``"Hallucination %"`` and is colored by that value
    after min-max normalization over the plotted rows. A dot at the end of
    each bar is green when ``"Answer %"`` is at least 95 and red otherwise.

    Args:
        df: DataFrame with at least the columns ``"LLM"``,
            ``"Hallucination %"`` and ``"Answer %"``. Only ``df.head(10)`` is
            plotted.
            NOTE(review): presumably already sorted by ascending hallucination
            rate; confirm against the caller.

    Returns:
        The matplotlib figure holding the chart.
    """
    fig = plt.figure(figsize=(10, 5))

    plot_df = df.head(10).copy()
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

    # barh draws y=0 at the bottom, so reverse to put the first row on top.
    plot_df = plot_df.iloc[::-1]

    y_positions = range(len(plot_df))
    plt.barh(y_positions, plot_df["Hallucination %"], color=plt.cm.RdYlGn_r(plot_df["normalized_hallucination_rate"]))

    # Read values by column name, not by the positional `_2` / `_3` fields of
    # itertuples(), so the plot does not depend on the column order of `df`.
    rate_pairs = zip(plot_df["Hallucination %"], plot_df["Answer %"])
    for y, (hallucination_rate, answer_rate) in enumerate(rate_pairs):
        # Value label just to the right of the bar.
        plt.text(hallucination_rate + 0.2, y, f"{hallucination_rate}%", ha='left', va='center', fontsize=8, fontweight='bold')
        # Answer rate indicator - colored dot at end of bar.
        ar_dot_color = '#22aa22' if answer_rate >= 95 else '#cc3333'
        plt.scatter(hallucination_rate, y, color=ar_dot_color, s=25, zorder=5)

    # Strip org prefix (e.g., "google/gemini-2.5" -> "gemini-2.5").
    labels = [name.split("/")[-1] for name in plot_df["LLM"]]
    plt.yticks(y_positions, labels, fontsize=8)

    plt.xlabel("Hallucination Rate", fontsize=10)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    # Empty scatters exist only to create the legend entries for the dots.
    plt.scatter([], [], color='#22aa22', s=25, label='≥95%')
    plt.scatter([], [], color='#cc3333', s=25, label='<95%')
    plt.legend(loc='upper right', fontsize=8, framealpha=0.9, title='Answer Rate', title_fontsize=8)

    plt.tight_layout()
    # Leave room for the long y tick labels and the footer text.
    plt.subplots_adjust(left=0.25, bottom=0.15)

    # Add copyright at bottom.
    plt.figtext(0.5, 0.02, f"Copyright (2025) Vectara, Inc. - Plot generated on {datetime.now().strftime('%B %d, %Y')}",
                ha='center', fontsize=10)
    return fig
| # %% | |
# Script entry point: download the results, build the leaderboard table and
# print it. `df` stays a module-level name so interactive cells can reuse it.
if __name__ == "__main__":
    df = load_results()
    print(df)
| # %% | |