# %%
import os
import json
from huggingface_hub import snapshot_download
import pandas as pd
import matplotlib.cm as cm
from matplotlib.colors import to_hex
import plotly.graph_objects as go
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# Shared scaler instance; re-fit on each call to visualize_leaderboard.
min_max_scaler = MinMaxScaler()


# %%
def pull_results(results_dir: str):
    """Download the vectara/results HuggingFace dataset into `results_dir`."""
    snapshot_download(
        repo_id="vectara/results",
        repo_type="dataset",
        local_dir=results_dir
    )


def extract_info_from_result_file(result_file):
    """Parse one result JSON file into a flat dict of leaderboard fields.

    Expected file shape:
    {
        "config": {
            "model_dtype": "float16",
            "model_name": "databricks/dbrx-instruct",
            "model_sha": "main"
        },
        "results": {
            "hallucination_rate": {"hallucination_rate": 8.34990059642147},
            "factual_consistency_rate": {"factual_consistency_rate": 91.65009940357854},
            "answer_rate": {"answer_rate": 100.0},
            "average_summary_length": {"average_summary_length": 85.9}
        }
    }

    Optional top-level keys ("model_annotations", "category_results",
    "text_complexity_results") are read with safe defaults when absent.
    """
    # Use a context manager so the file handle is always closed
    # (the previous json.load(open(...)) leaked the handle).
    with open(result_file, 'r') as f:
        info = json.load(f)

    # Extract model_annotations with defaults for missing data
    annotations = info.get("model_annotations", {})
    model_size = annotations.get("model_size", "unknown")
    accessibility = annotations.get("accessibility", "unknown")

    result = {
        # Trailing '-' stripped; some uploaded model names carry a stray dash.
        "LLM": info["config"]["model_name"].rstrip("-"),
        "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
        "Answer %": info["results"]["answer_rate"]["answer_rate"],
        "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
        "Model Size": model_size,
        "Accessibility": accessibility,
        "category_results": info.get("category_results", {}),
        "text_complexity_results": info.get("text_complexity_results", {}),
    }
    return result


def get_latest_result_file(dir: str):
    """Return the most recently modified .json file in `dir`.

    Returns None when `dir` does not exist or contains no .json files.
    Recency is determined by file modification time (mtime), not by any
    timestamp embedded in the file name.
    """
    if not os.path.isdir(dir):
        return None
    files = [f for f in os.listdir(dir) if f.endswith(".json")]
    if len(files) == 0:
        return None
    files.sort(key=lambda x: os.path.getmtime(os.path.join(dir, x)))
    # Return the last file (most recent by mtime)
    return os.path.join(dir, files[-1])


def scan_and_extract(dir: str):
    """Scan all folders recursively and exhaustively to load all JSON files
    and call `extract_info_from_result_file` on each one.

    For every subdirectory encountered during the walk, only its latest
    result file (by mtime) is extracted.
    """
    results = []
    for root, dirs, files in os.walk(dir):
        # Leaf directories are handled when their parent is visited.
        if len(dirs) == 0:
            continue
        # NOTE: loop variable renamed from `dir` — it shadowed both the
        # function parameter and the builtin.
        for subdir in dirs:
            result_file = get_latest_result_file(os.path.join(root, subdir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results


def load_results(results_dir: str = "/tmp/hhem_results"):
    """Load results from HuggingFace dataset, processed entirely in memory.

    Returns a DataFrame sorted ascending by "Hallucination %", with metric
    columns rounded to 3 decimals and an extra "LLM_lower_case" helper
    column for case-insensitive lookups.

    Raises:
        ValueError: if no result files are found under `results_dir`.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")
    results = scan_and_extract(results_dir)
    if not results:
        raise ValueError(f"No results found in {results_dir}")
    print(f"Successfully extracted {len(results)} results")

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)
    # Placeholder "TBD" entries are pushed to the bottom by treating them as 100.
    results_df = results_df.replace("TBD", 100)
    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))
    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
    return results_df


# Mapping from dropdown display values to internal keys
DATA_SLICE_MAP = {
    "Overall": ("overall", None),
    "Low Complexity": ("complexity", "low_complexity_text"),
    "High Complexity": ("complexity", "high_complexity_text"),
    "Business": ("category", "business"),
    "Education": ("category", "education"),
    "Finance": ("category", "finance"),
    "Law": ("category", "law"),
    "Medicine": ("category", "medicine"),
    "Politics": ("category", "politics"),
    "Science": ("category", "science"),
    "Sports": ("category", "sports"),
    "Stocks": ("category", "stocks"),
    "Technology": ("category", "technology"),
}


def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
    """Apply a data slice filter to recalculate metrics.

    Args:
        df: DataFrame with category_results and text_complexity_results columns
        slice_name: Display name of the slice (e.g., "Overall", "Low Complexity", "Business")

    Returns:
        DataFrame with recalculated metrics, sorted by Hallucination % ascending.
        Rows lacking data for the requested slice are dropped. Unknown slice
        names and "Overall" return `df` unchanged.
    """
    if slice_name not in DATA_SLICE_MAP:
        return df
    slice_type, slice_key = DATA_SLICE_MAP[slice_name]
    if slice_type == "overall":
        return df

    result_df = df.copy()
    rows_to_keep = []
    for idx, row in result_df.iterrows():
        if slice_type == "complexity":
            data = row.get("text_complexity_results", {})
        else:  # category
            data = row.get("category_results", {})
        if not data or slice_key not in data:
            continue
        slice_data = data[slice_key]
        if not slice_data:
            continue
        # Update metrics from slice data
        result_df.at[idx, "Hallucination %"] = round(
            slice_data.get("hallucination_rate", 0), 3
        )
        result_df.at[idx, "Answer %"] = round(
            slice_data.get("answer_rate", 0), 3
        )
        result_df.at[idx, "Avg Summary Words"] = round(
            slice_data.get("average_summary_length", 0), 3
        )
        rows_to_keep.append(idx)

    # Filter to only rows with data for this slice
    result_df = result_df.loc[rows_to_keep]
    # Re-sort by hallucination rate
    result_df = result_df.sort_values(by="Hallucination %", ascending=True)
    return result_df


# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick a bar-label font size from the rate and the model-name length.

    If hallucination percentage is low and the LLM name is long, use a
    smaller font size so the label still fits.

    NOTE(review): the 0.25 threshold assumes `hallucination_percent` is a
    fraction in [0, 1], while load_results produces percentages like 8.35 —
    confirm which scale callers pass in.
    """
    name_length = len(LLM)
    if hallucination_percent < 0.25:
        if name_length > 10:
            return 8.5
        else:
            return 9
    else:
        return 9


def determine_font_color(hallucination_percent: float) -> str:
    """Black text on mid-range (light-colored) bars, white elsewhere."""
    if 0.25 < hallucination_percent < 0.65:
        return 'black'
    else:
        return 'white'


def determine_llm_x_position_and_font_color(
        LLM: str, hallucination_percent: float) -> tuple:
    """Return (x_position, font_color) for a bar label.

    Labels that fit inside the bar sit near the axis and use the
    contrast-aware color; labels longer than the bar are placed at the bar
    end in black.
    """
    name_length = len(LLM)
    print("LLM: ", LLM, "hallu_rate: ", hallucination_percent,
          "name_length: ", name_length)
    # Heuristic conversion from rate to an approximate on-screen bar length
    # in character widths.
    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
    if name_length < bar_length:
        return 0.01, determine_font_color(hallucination_percent)
    else:
        # to the right of the bar, black anyway
        return hallucination_percent, 'black'


def visualize_leaderboard(df: pd.DataFrame) -> go.Figure:
    """Create interactive horizontal bar chart with warning icons for low answer rate."""
    plot_df = df.head(10).copy()
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(
        plot_df[["Hallucination %"]]
    )
    plot_df = plot_df.iloc[::-1]  # Reverse for bottom-to-top display

    # Strip org prefix for labels
    labels = [name.split("/")[-1] for name in plot_df["LLM"]]

    # Calculate colors (RdYlGn_r) and patterns (hatched for low AR)
    colors = []
    patterns = []
    for _, row in plot_df.iterrows():
        colors.append(to_hex(cm.RdYlGn_r(row["normalized_hallucination_rate"])))
        patterns.append("/" if row["Answer %"] < 95 else "")

    # Hover text with full details; Plotly hover uses "<br>" for line breaks.
    hover_texts = [
        f"{label}<br>"
        f"Hallucination Rate: {row['Hallucination %']}%<br>"
        f"Answer Rate: {row['Answer %']}%"
        + (" ✓" if row["Answer %"] >= 95 else " (below 95%)")
        for label, (_, row) in zip(labels, plot_df.iterrows())
    ]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=labels,
        x=plot_df["Hallucination %"],
        orientation='h',
        marker=dict(
            color=colors,
            pattern_shape=patterns,
            pattern_fillmode="overlay",
            line=dict(width=0)
        ),
        text=[f"{val}%" for val in plot_df["Hallucination %"]],
        textposition='outside',
        textfont=dict(size=10, color='black'),
        hovertemplate="%{customdata}",
        customdata=hover_texts
    ))

    # Title with copyright
    title_text = (
        f"Grounded Hallucination Rate of Best LLMs · "
        f"© {datetime.now().year} Vectara · Created {datetime.now().strftime('%B %d, %Y')}"
    )

    fig.update_layout(
        title=dict(text=title_text, font=dict(size=13), x=0.5, xanchor='center'),
        xaxis=dict(title="Hallucination Rate",
                   range=[0, max(plot_df["Hallucination %"]) * 1.15]),
        yaxis=dict(title=""),
        showlegend=False,
        height=400,
        margin=dict(l=180, r=50, t=50, b=40),
        annotations=[
            dict(
                text="Striped = Answer Rate < 95%",
                xref="paper", yref="paper",
                x=1.0, y=0.98,
                showarrow=False,
                font=dict(size=10, color="gray"),
                xanchor="right", yanchor="top"
            )
        ]
    )
    return fig


# %%
if __name__ == "__main__":
    df = load_results()
    print(df)

# %%