Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| # %% | |
| import os | |
| import json | |
| from huggingface_hub import snapshot_download | |
| import pandas as pd | |
| import matplotlib.cm as cm | |
| from matplotlib.colors import to_hex | |
| import plotly.graph_objects as go | |
| from datetime import datetime | |
| from sklearn.preprocessing import MinMaxScaler | |
# Module-level scaler shared across calls; visualize_leaderboard refits it on
# every call via fit_transform, so no fitted state is relied on between calls.
min_max_scaler = MinMaxScaler()
| # %% | |
def pull_results(results_dir: str):
    """Download a snapshot of the ``vectara/results`` dataset repo.

    Args:
        results_dir: Local directory the snapshot is written into.
    """
    download_options = {
        "repo_id": "vectara/results",
        "repo_type": "dataset",
        "local_dir": results_dir,
    }
    snapshot_download(**download_options)
def extract_info_from_result_file(result_file):
    """Read one result JSON file and flatten it into a leaderboard row.

    The file is expected to look like::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {
                    "hallucination_rate": 8.34990059642147
                },
                "factual_consistency_rate": {
                    "factual_consistency_rate": 91.65009940357854
                },
                "answer_rate": {
                    "answer_rate": 100.0
                },
                "average_summary_length": {
                    "average_summary_length": 85.9
                }
            }
        }

    The optional top-level keys ``model_annotations`` (with ``model_size`` and
    ``accessibility``), ``category_results`` and ``text_complexity_results``
    are read when present and fall back to ``"unknown"`` / ``{}`` otherwise.

    Args:
        result_file: Path of the JSON file to read.

    Returns:
        A dict with the keys ``LLM``, ``Hallucination %``, ``Answer %``,
        ``Avg Summary Words``, ``Model Size``, ``Accessibility``,
        ``category_results`` and ``text_complexity_results``.

    Raises:
        KeyError: If ``config.model_name`` or one of the required ``results``
            entries is missing.
    """
    # Use a context manager so the handle is closed deterministically, and
    # decode as UTF-8 (the JSON interchange encoding) instead of the locale's.
    with open(result_file, "r", encoding="utf-8") as handle:
        info = json.load(handle)

    # `or {}` also covers an explicit `"model_annotations": null`.
    annotations = info.get("model_annotations") or {}
    results = info["results"]

    return {
        # Trailing dashes are stripped from the model name.
        "LLM": info["config"]["model_name"].rstrip("-"),
        "Hallucination %": results["hallucination_rate"]["hallucination_rate"],
        "Answer %": results["answer_rate"]["answer_rate"],
        "Avg Summary Words": results["average_summary_length"]["average_summary_length"],
        "Model Size": annotations.get("model_size", "unknown"),
        "Accessibility": annotations.get("accessibility", "unknown"),
        "category_results": info.get("category_results", {}),
        "text_complexity_results": info.get("text_complexity_results", {}),
    }
def get_latest_result_file(dir: str):
    """Return the path of the most recently modified ``.json`` file in ``dir``.

    "Latest" is decided by file modification time (``os.path.getmtime``), not
    by any timestamp embedded in the file name. When several files share the
    same modification time the lexicographically greatest file name wins, so
    the result does not depend on the arbitrary order of ``os.listdir``.

    Args:
        dir: Directory to look in (not searched recursively).

    Returns:
        The joined path ``dir/<file>`` of the newest JSON file, or ``None``
        when ``dir`` is not a directory or contains no ``.json`` entries.
    """
    if not os.path.isdir(dir):
        return None

    candidates = [name for name in os.listdir(dir) if name.endswith(".json")]
    if not candidates:
        return None

    # max() with a (mtime, name) key: a single pass, with a deterministic
    # tie-break on the file name.
    latest = max(
        candidates,
        key=lambda name: (os.path.getmtime(os.path.join(dir, name)), name),
    )
    return os.path.join(dir, latest)
def scan_and_extract(dir: str):
    """Collect one leaderboard row per sub-directory found under ``dir``.

    The tree rooted at ``dir`` is walked recursively. For every sub-directory
    at any depth, the latest JSON file in it (as chosen by
    ``get_latest_result_file``) is parsed with
    ``extract_info_from_result_file``. Sub-directories without a JSON file
    contribute nothing, and JSON files sitting directly in ``dir`` itself are
    never examined.

    Args:
        dir: Root directory of the downloaded results.

    Returns:
        A list of row dicts, in ``os.walk`` traversal order.
    """
    results = []
    # The loop variable is deliberately not called `dir`, so the parameter
    # is not rebound while walking.
    for root, subdirs, _files in os.walk(dir):
        for subdir in subdirs:
            result_file = get_latest_result_file(os.path.join(root, subdir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results
def load_results(results_dir: str = "/tmp/hhem_results"):
    """Download the results dataset and return it as a sorted DataFrame.

    The snapshot is written to ``results_dir`` on disk, every result file
    found there is parsed, and the rows are assembled into a DataFrame.

    Args:
        results_dir: Local directory the dataset snapshot is downloaded into.

    Returns:
        A DataFrame sorted by ``Hallucination %`` ascending, with the three
        metric columns rounded to 3 decimals and an extra ``LLM_lower_case``
        column holding the lower-cased model name.

    Raises:
        ValueError: If no result file could be extracted from ``results_dir``.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")

    results = scan_and_extract(results_dir)
    if not results:
        raise ValueError(f"No results found in {results_dir}")
    print(f"Successfully extracted {len(results)} results")

    results_df = pd.DataFrame(results)
    # Replace "TBD" placeholders BEFORE sorting: a string left in the sort
    # column cannot be ordered against floats and makes sort_values raise.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
    return results_df
# Display names of the per-category slices; each maps to its lower-cased name
# as the lookup key inside a row's `category_results`.
_CATEGORY_SLICE_NAMES = (
    "Business",
    "Education",
    "Finance",
    "Law",
    "Medicine",
    "Politics",
    "Science",
    "Sports",
    "Stocks",
    "Technology",
)

# Dropdown display value -> (slice type, key inside that slice's results dict).
# "Overall" has no key because it leaves the metrics untouched.
DATA_SLICE_MAP = {
    "Overall": ("overall", None),
    "Low Complexity": ("complexity", "low_complexity_text"),
    "High Complexity": ("complexity", "high_complexity_text"),
    **{name: ("category", name.lower()) for name in _CATEGORY_SLICE_NAMES},
}
def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
    """Recalculate the leaderboard metrics for one data slice.

    Args:
        df: DataFrame with ``category_results`` and ``text_complexity_results``
            columns whose cells are dicts keyed by slice key.
        slice_name: Display name of the slice (e.g. "Overall",
            "Low Complexity", "Business"), looked up in ``DATA_SLICE_MAP``.

    Returns:
        For an unknown ``slice_name`` or the "Overall" slice, ``df`` itself,
        unchanged. Otherwise a new DataFrame containing only the rows that
        have non-empty data for the slice, with ``Hallucination %``,
        ``Answer %`` and ``Avg Summary Words`` replaced by the slice's values
        (rounded to 3 decimals, 0 when a metric is absent), sorted by
        ``Hallucination %`` ascending.
    """
    slice_spec = DATA_SLICE_MAP.get(slice_name)
    if slice_spec is None:
        return df
    slice_type, slice_key = slice_spec
    if slice_type == "overall":
        return df

    source_column = (
        "text_complexity_results" if slice_type == "complexity" else "category_results"
    )
    # Output column -> key inside the per-slice result dict.
    metric_keys = {
        "Hallucination %": "hallucination_rate",
        "Answer %": "answer_rate",
        "Avg Summary Words": "average_summary_length",
    }

    # A frame without the source column simply has no row with slice data.
    cells = df[source_column] if source_column in df.columns else []

    kept_positions = []
    new_values = {column: [] for column in metric_keys}
    for position, cell in enumerate(cells):
        # Cells that are not dicts (e.g. NaN for a missing value) carry no
        # slice data; skip them instead of failing on a membership test.
        slice_data = cell.get(slice_key) if isinstance(cell, dict) else None
        if not isinstance(slice_data, dict) or not slice_data:
            continue
        kept_positions.append(position)
        for column, key in metric_keys.items():
            new_values[column].append(round(slice_data.get(key, 0), 3))

    # Select by position so duplicate index labels cannot multiply rows.
    result_df = df.iloc[kept_positions].copy()
    if kept_positions:
        # Whole-column assignment avoids per-cell dtype upcasting issues.
        for column, values in new_values.items():
            result_df[column] = values

    return result_df.sort_values(by="Hallucination %", ascending=True)
| # %% | |
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick the label font size from the model name and hallucination value.

    Args:
        LLM: Model name; only its length is used.
        hallucination_percent: Hallucination value compared against 0.25.
            NOTE(review): the threshold suggests a 0-1 scale rather than
            0-100 - confirm with the caller.

    Returns:
        8.5 when the value is below 0.25 and the name is longer than 10
        characters (a short bar with a long label), otherwise 9.
    """
    if hallucination_percent < 0.25 and len(LLM) > 10:
        return 8.5
    return 9
def determine_font_color(hallucination_percent: float) -> str:
    """Return 'black' for values strictly between 0.25 and 0.65, else 'white'."""
    in_middle_band = 0.25 < hallucination_percent and hallucination_percent < 0.65
    return 'black' if in_middle_band else 'white'
def determine_llm_x_position_and_font_color(
    LLM: str, hallucination_percent: float
) -> "tuple[float, str]":
    """Decide where to draw a model's label and in which color.

    The bar length is estimated as ``5 * hallucination_percent`` and compared
    with the number of characters in the name.

    Args:
        LLM: Model name; only its length is used.
        hallucination_percent: Hallucination value of the model.

    Returns:
        A ``(x_position, font_color)`` tuple. When the name is shorter than
        the estimated bar, the label goes inside the bar at ``x = 0.01`` with
        the color chosen by ``determine_font_color``. Otherwise it goes at
        ``x = hallucination_percent`` (to the right of the bar) in black.
    """
    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if len(LLM) < bar_length:
        return 0.01, determine_font_color(hallucination_percent)
    # To the right of the bar the background is the plot itself: always black.
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> go.Figure:
    """Create an interactive horizontal bar chart of the first ten rows of ``df``.

    Each bar shows a model's ``Hallucination %``. Bars are colored on the
    reversed RdYlGn colormap by the min-max normalized hallucination rate of
    the plotted rows, and are hatched when ``Answer %`` is below 95. Hovering
    a bar shows both rates.

    Args:
        df: DataFrame with ``LLM``, ``Hallucination %`` and ``Answer %``
            columns. Only ``df.head(10)`` is plotted, so the frame should
            already be sorted in the desired order.

    Returns:
        The plotly figure. For an empty ``df`` the figure has the usual title
        and layout but no bars.
    """
    plot_df = df.head(10).copy()
    fig = go.Figure()
    x_max = 0.0

    # An empty frame (e.g. a data slice no model has results for) would make
    # the scaler and the axis-range computation fail, so bars are only built
    # when there is something to plot.
    if not plot_df.empty:
        plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(
            plot_df[["Hallucination %"]]
        )
        plot_df = plot_df.iloc[::-1]  # Reverse for bottom-to-top display

        hallucination_rates = plot_df["Hallucination %"]
        answer_rates = plot_df["Answer %"]
        x_max = float(hallucination_rates.max())

        # Strip org prefix for labels
        labels = [name.split("/")[-1] for name in plot_df["LLM"]]

        # Colors (RdYlGn_r) and patterns (hatched for low answer rate)
        colors = [
            to_hex(cm.RdYlGn_r(value))
            for value in plot_df["normalized_hallucination_rate"]
        ]
        patterns = ["/" if answer < 95 else "" for answer in answer_rates]

        # Hover text with full details
        hover_texts = [
            f"<b>{label}</b><br>"
            f"Hallucination Rate: {hallucination}%<br>"
            f"Answer Rate: {answer}%"
            + (" ✓" if answer >= 95 else " (below 95%)")
            for label, hallucination, answer in zip(
                labels, hallucination_rates, answer_rates
            )
        ]

        fig.add_trace(go.Bar(
            y=labels,
            x=hallucination_rates,
            orientation='h',
            marker=dict(
                color=colors,
                pattern_shape=patterns,
                pattern_fillmode="overlay",
                line=dict(width=0)
            ),
            text=[f"{val}%" for val in hallucination_rates],
            textposition='outside',
            textfont=dict(size=10, color='black'),
            hovertemplate="%{customdata}<extra></extra>",
            customdata=hover_texts
        ))

    # Title with copyright; read the clock once so year and date always agree.
    now = datetime.now()
    title_text = (
        "Grounded Hallucination Rate of Best LLMs · "
        f"© {now.year} Vectara · Created {now.strftime('%B %d, %Y')}"
    )

    # Leave 15% headroom for the outside value labels; fall back to a unit
    # range when there is no positive rate, to avoid a degenerate [0, 0] axis.
    x_upper = x_max * 1.15 if x_max > 0 else 1

    fig.update_layout(
        title=dict(text=title_text, font=dict(size=13), x=0.5, xanchor='center'),
        xaxis=dict(title="Hallucination Rate", range=[0, x_upper]),
        yaxis=dict(title=""),
        showlegend=False,
        height=400,
        margin=dict(l=180, r=50, t=50, b=40),
        annotations=[
            dict(
                text="Striped = Answer Rate < 95%",
                xref="paper", yref="paper", x=1.0, y=0.98,
                showarrow=False, font=dict(size=10, color="gray"), xanchor="right", yanchor="top"
            )
        ]
    )
    return fig
| # %% | |
# Script entry point: download and parse the results, then print the
# resulting DataFrame. `df` stays bound at module level afterwards.
if __name__ == "__main__":
    df = load_results()
    print(df)
| # %% | |