Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

File size: 9,724 Bytes

# %%
import os
import json
from huggingface_hub import snapshot_download
import pandas as pd
import matplotlib.cm as cm
from matplotlib.colors import to_hex
import plotly.graph_objects as go
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# Shared module-level scaler; visualize_leaderboard() re-fits it on every call via fit_transform().
min_max_scaler = MinMaxScaler()

# %%
def pull_results(results_dir: str):
    """Download a snapshot of the ``vectara/results`` dataset repository.

    Args:
        results_dir: Local directory handed to ``snapshot_download`` as
            ``local_dir``; the snapshot is written there.
    """
    download_options = {
        "repo_id": "vectara/results",
        "repo_type": "dataset",
        "local_dir": results_dir,
    }
    snapshot_download(**download_options)

def extract_info_from_result_file(result_file):
    """Load one result JSON file and flatten it into a leaderboard row.

    Expected file layout (``model_annotations``, ``category_results`` and
    ``text_complexity_results`` are optional top-level keys)::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {"hallucination_rate": 8.34990059642147},
                "factual_consistency_rate": {"factual_consistency_rate": 91.65009940357854},
                "answer_rate": {"answer_rate": 100.0},
                "average_summary_length": {"average_summary_length": 85.9}
            }
        }

    Args:
        result_file: Path of the JSON result file to read.

    Returns:
        dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``,
        ``"Avg Summary Words"``, ``"Model Size"``, ``"Accessibility"``,
        ``"category_results"`` and ``"text_complexity_results"``.

    Raises:
        KeyError: If ``config.model_name`` or one of the required ``results``
            entries is missing from the file.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(result_file, "r", encoding="utf-8") as fh:
        info = json.load(fh)

    # Extract model_annotations with defaults for missing data
    annotations = info.get("model_annotations", {})
    model_size = annotations.get("model_size", "unknown")
    accessibility = annotations.get("accessibility", "unknown")

    metrics = info["results"]
    return {
        # Trailing dashes are stripped from the model name.
        "LLM": info["config"]["model_name"].rstrip("-"),
        "Hallucination %": metrics["hallucination_rate"]["hallucination_rate"],
        "Answer %": metrics["answer_rate"]["answer_rate"],
        "Avg Summary Words": metrics["average_summary_length"]["average_summary_length"],
        "Model Size": model_size,
        "Accessibility": accessibility,
        # Per-slice breakdowns are optional; default to empty dicts.
        "category_results": info.get("category_results", {}),
        "text_complexity_results": info.get("text_complexity_results", {}),
    }

def get_latest_result_file(dir: str):
    """Return the path of the most recently modified ``.json`` file in ``dir``.

    "Latest" is decided by filesystem modification time, not by any timestamp
    embedded in the file name. Files sharing the same modification time are
    ordered by name so the choice is deterministic rather than dependent on
    the order ``os.listdir`` happens to return.

    Args:
        dir: Directory to inspect (not searched recursively).

    Returns:
        ``dir`` joined with the chosen file name, or ``None`` when ``dir`` is
        not a directory or contains no ``.json`` files.
    """
    if not os.path.isdir(dir):
        return None
    json_files = [name for name in os.listdir(dir) if name.endswith(".json")]
    if not json_files:
        return None
    # max() avoids sorting the whole list; the name is only a tie-breaker.
    latest = max(
        json_files,
        key=lambda name: (os.path.getmtime(os.path.join(dir, name)), name),
    )
    return os.path.join(dir, latest)

def scan_and_extract(dir: str):
    """Walk ``dir`` and extract the latest result of every sub-directory.

    For each directory found below ``dir`` (at any depth), the file chosen by
    ``get_latest_result_file`` is parsed with ``extract_info_from_result_file``.
    Only one file per directory is loaded, and JSON files placed directly in
    ``dir`` itself are not read.

    Args:
        dir: Root directory to walk.

    Returns:
        List of the dicts returned by ``extract_info_from_result_file``;
        empty when no sub-directory yields a result file.
    """
    results = []
    # The loop variable is deliberately not called `dir`, so it does not
    # rebind the function's own parameter.
    for root, subdirs, _files in os.walk(dir):
        for subdir in subdirs:
            result_file = get_latest_result_file(os.path.join(root, subdir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results

def load_results(results_dir: str = "/tmp/hhem_results"):
    """Download the results dataset and build the leaderboard DataFrame.

    The dataset snapshot is written to ``results_dir`` on disk, every result
    found there is extracted, and the rows are assembled into a DataFrame.

    Args:
        results_dir: Local directory the snapshot is downloaded into and
            scanned from.

    Returns:
        DataFrame sorted ascending by ``"Hallucination %"``, with the three
        metric columns rounded to 3 decimals and an added ``"LLM_lower_case"``
        column.

    Raises:
        ValueError: If no results could be extracted from ``results_dir``.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")

    results = scan_and_extract(results_dir)
    if not results:
        raise ValueError(f"No results found in {results_dir}")

    print(f"Successfully extracted {len(results)} results")

    results_df = pd.DataFrame(results)
    # Replace the "TBD" placeholder (presumably a not-yet-computed metric)
    # BEFORE sorting: a string left in the sort column makes sort_values fail
    # on mixed str/float comparison.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    # Lower-cased copy of the model name.
    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()

    return results_df


# Category slices, in dropdown order; the display name is the capitalized key.
_CATEGORY_KEYS = (
    "business",
    "education",
    "finance",
    "law",
    "medicine",
    "politics",
    "science",
    "sports",
    "stocks",
    "technology",
)

# Dropdown display value -> (slice type, key inside that slice's results dict).
DATA_SLICE_MAP = {
    "Overall": ("overall", None),
    "Low Complexity": ("complexity", "low_complexity_text"),
    "High Complexity": ("complexity", "high_complexity_text"),
    **{key.capitalize(): ("category", key) for key in _CATEGORY_KEYS},
}


def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
    """Swap the headline metrics for those of one data slice.

    Args:
        df: DataFrame carrying ``category_results`` and
            ``text_complexity_results`` columns.
        slice_name: Display name of the slice (e.g., "Overall",
            "Low Complexity", "Business").

    Returns:
        ``df`` itself when the slice is unknown or is "Overall". Otherwise a
        copy restricted to the rows that have data for the slice, with the
        three metric columns overwritten from the slice data (rounded to 3
        decimals) and sorted by ``"Hallucination %"`` ascending.
    """
    slice_type, slice_key = DATA_SLICE_MAP.get(slice_name, ("overall", None))
    if slice_type == "overall":
        return df

    # Every slice type other than "complexity" reads the category results.
    if slice_type == "complexity":
        source_column = "text_complexity_results"
    else:
        source_column = "category_results"

    # DataFrame column -> key inside the per-slice metrics dict.
    metric_columns = (
        ("Hallucination %", "hallucination_rate"),
        ("Answer %", "answer_rate"),
        ("Avg Summary Words", "average_summary_length"),
    )

    sliced = df.copy()
    kept_labels = []
    for label, record in sliced.iterrows():
        per_slice = record.get(source_column, {})
        if not per_slice or slice_key not in per_slice:
            continue

        metrics = per_slice[slice_key]
        if not metrics:
            continue

        # A metric absent from the slice data falls back to 0.
        for column, metric_key in metric_columns:
            sliced.at[label, column] = round(metrics.get(metric_key, 0), 3)
        kept_labels.append(label)

    # Keep only the rows that had slice data, then restore the ranking order.
    return sliced.loc[kept_labels].sort_values(by="Hallucination %", ascending=True)

# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Choose a label font size from the model name and hallucination value.

    Args:
        LLM: Model name; only its length is used.
        hallucination_percent: Hallucination value compared against 0.25.

    Returns:
        8.5 when ``hallucination_percent`` is below 0.25 and the name is
        longer than 10 characters, otherwise 9.
    """
    name_length = len(LLM)
    # Only a long name combined with a low hallucination value gets the
    # slightly smaller font.
    if hallucination_percent < 0.25 and name_length > 10:
        return 8.5
    return 9
    
def determine_font_color(hallucination_percent: float) -> str:
    """Pick the label color for a hallucination value.

    Values strictly between 0.25 and 0.65 get ``'black'``; everything else,
    including both boundary values, gets ``'white'``.
    """
    in_middle_band = hallucination_percent > 0.25 and hallucination_percent < 0.65
    return 'black' if in_middle_band else 'white'

def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> tuple[float, str]:
    """Decide where to draw a model's name relative to its bar, and in what color.

    The name length (in characters) is compared against
    ``5 * hallucination_percent``, used here as the bar length.

    Args:
        LLM: Model name; only its length is used (and echoed in the debug print).
        hallucination_percent: Hallucination value of the bar.

    Returns:
        ``(x_position, font_color)``. ``(0.01, determine_font_color(...))``
        when the name is shorter than the bar length, otherwise
        ``(hallucination_percent, 'black')``.
    """
    name_length = len(LLM)
    # NOTE(review): debug output emitted on every call; kept so stdout is unchanged.
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
    if name_length < bar_length:
        # Name is drawn inside the bar, near its left edge.
        return 0.01, determine_font_color(hallucination_percent)
    # Name is drawn to the right of the bar, so it is always black.
    return hallucination_percent, 'black'

def visualize_leaderboard(df: pd.DataFrame) -> go.Figure:
    """Create an interactive horizontal bar chart of the first 10 rows of ``df``.

    Bars are colored with the reversed ``RdYlGn`` colormap according to the
    min-max normalized hallucination rate of the plotted rows, and are hatched
    when the answer rate is below 95%.

    Args:
        df: Leaderboard DataFrame with ``"LLM"``, ``"Hallucination %"`` and
            ``"Answer %"`` columns. The first 10 rows are plotted, so the
            caller is expected to pass it already sorted.

    Returns:
        The plotly figure. When ``df`` has no rows (for example a data slice
        with no results) the figure has the usual layout but no bars, instead
        of raising.
    """
    plot_df = df.head(10).copy()

    fig = go.Figure()
    # Fallback x-axis upper bound, used when there is nothing to scale against.
    x_upper = 1

    if not plot_df.empty:
        # The scaler and max() below both fail on empty input, hence the guard.
        plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(
            plot_df[["Hallucination %"]]
        )
        plot_df = plot_df.iloc[::-1]  # Reverse for bottom-to-top display

        # Strip org prefix for labels
        labels = [name.split("/")[-1] for name in plot_df["LLM"]]

        # Calculate colors (RdYlGn_r) and patterns (hatched for low AR)
        colors = []
        patterns = []
        for _, row in plot_df.iterrows():
            colors.append(to_hex(cm.RdYlGn_r(row["normalized_hallucination_rate"])))
            patterns.append("/" if row["Answer %"] < 95 else "")

        # Hover text with full details
        hover_texts = [
            f"<b>{label}</b><br>"
            f"Hallucination Rate: {row['Hallucination %']}%<br>"
            f"Answer Rate: {row['Answer %']}%"
            + (" ✓" if row["Answer %"] >= 95 else " (below 95%)")
            for label, (_, row) in zip(labels, plot_df.iterrows())
        ]

        fig.add_trace(go.Bar(
            y=labels,
            x=plot_df["Hallucination %"],
            orientation='h',
            marker=dict(
                color=colors,
                pattern_shape=patterns,
                pattern_fillmode="overlay",
                line=dict(width=0)
            ),
            text=[f"{val}%" for val in plot_df["Hallucination %"]],
            textposition='outside',
            textfont=dict(size=10, color='black'),
            hovertemplate="%{customdata}<extra></extra>",
            customdata=hover_texts
        ))

        # Leave 15% headroom for the outside value labels; keep the fallback
        # when every rate is 0 so the axis range is never [0, 0].
        top_rate = max(plot_df["Hallucination %"])
        if top_rate > 0:
            x_upper = top_rate * 1.15

    # Title with copyright; read the clock once so year and date agree.
    now = datetime.now()
    title_text = (
        f"Grounded Hallucination Rate of Best LLMs · "
        f"© {now.year} Vectara · Created {now.strftime('%B %d, %Y')}"
    )

    fig.update_layout(
        title=dict(text=title_text, font=dict(size=13), x=0.5, xanchor='center'),
        xaxis=dict(title="Hallucination Rate", range=[0, x_upper]),
        yaxis=dict(title=""),
        showlegend=False,
        height=400,
        margin=dict(l=180, r=50, t=50, b=40),
        annotations=[
            dict(
                text="Striped = Answer Rate < 95%",
                xref="paper", yref="paper", x=1.0, y=0.98,
                showarrow=False, font=dict(size=10, color="gray"), xanchor="right", yanchor="top"
            )
        ]
    )
    return fig

# %%

if __name__ == "__main__":
    # Script entry point: build the leaderboard DataFrame via load_results() and print it.
    df = load_results()
    print(df)

# %%