Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

App Files Files Community

ofermend committed 3 days ago

Commit

d0c57df

1 Parent(s): 8bb0636

updated to gradio; python 3.11; visual improvements

Browse files

Files changed (6) hide show

Dockerfile +2 -3
app/app.py +114 -48
app/app_utils.py +60 -71
app/requirements.txt +3 -5
app/vectara_theme.py +0 -29
src/display/utils.py +1 -1

Dockerfile CHANGED Viewed

@@ -1,8 +1,7 @@
-FROM python:3.10
 WORKDIR /app
-COPY ./app/vectara_theme.py /app/vectara_theme.py
 COPY ./app/requirements.txt /app/requirements.txt
 COPY ./app/app.py /app/app.py
 COPY ./app/app_utils.py /app/app_utils.py
@@ -18,4 +17,4 @@ ENV HOME=/home/user \
 RUN mkdir -p /app/results
 RUN chown -R user /app
-CMD ["funix", "app.py", "--host", "0.0.0.0", "--port", "7860", "--no-browser"]

+FROM python:3.11
 WORKDIR /app
 COPY ./app/requirements.txt /app/requirements.txt
 COPY ./app/app.py /app/app.py
 COPY ./app/app_utils.py /app/app_utils.py
 RUN mkdir -p /app/results
 RUN chown -R user /app
+CMD ["python", "app.py"]

app/app.py CHANGED Viewed

@@ -1,62 +1,128 @@
-from typing import Callable, Literal, List, Tuple
-import json
 import pandas as pd
-import matplotlib.figure
-from IPython.display import Markdown
-from funix import funix, import_theme
-from vectara_theme import vectara_theme
-import_theme(vectara_theme)
 from app_utils import load_results, visualize_leaderboard
 results_df = load_results()
-@funix(
-    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
-    direction="column",
-    autorun="always",
-    theme="vectara",
-    matplotlib_format="svg",
-    # output_layout=[
-    #     [{"return_index": 0, "width": 0.3}],
-    #     [{"return_index": 1, "width": 0.7}],
-    # ]
-)
-def leaderboard(
-    filter_models_by_name: str = ""
-    # filter_models_by_name: List[Literal["all", "anthropic", "google", "meta", "openai", "xai", "qwen"]] = ["all"]
-) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
-# ) -> Tuple[Markdown, pd.DataFrame]:
-    """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
-    Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model).    For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
-    **Usage:**
-    * All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
-    * Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
-    * To sort the table, hover over a column header and click the arrow. The arrow automatically points up and down depending on the sort order.
-    * Click the "Refresh" button to refresh the leaderboard if the table is not shown or does not update when you change the filter.
-    Args:
-        filter_models_by_name: filter models by name using comma-separated strings
-    """
-    df = results_df
     filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
-    if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name:
-        filter_models_by_name = filter_models_by_name.split(";")
-        # filter_models_by_name = [name for name in filter_models_by_name if name != "all"]
-        filter_models_by_name = [name for name in filter_models_by_name if name != ""]
-        df = df.copy()
-        df = df[df["LLM_lower_case"].str.contains("|".join(filter_models_by_name), na=False)]
-        if len(df) == 0: # return an empty DF and an empty figure
-            return Markdown(f"No models found matching: {filter_models_by_name}"), matplotlib.figure.Figure(), pd.DataFrame()
-    # return Markdown(""), df
     fig = visualize_leaderboard(df)
-    return Markdown(""), fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]

+import gradio as gr
 import pandas as pd
+import matplotlib.pyplot
 from app_utils import load_results, visualize_leaderboard
 results_df = load_results()
+DESCRIPTION = """
+# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
+Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
+this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
+For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
+For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
+"""
+def leaderboard(
+    filter_models_by_name: str = "",
+    high_ar_only: bool = False,
+    size_filter: str = "all",
+    access_filter: str = "all"
+):
+    """Filter and display the leaderboard."""
+    df = results_df.copy()
+    # Filter by answer rate if toggle is on
+    if high_ar_only:
+        df = df[df["Answer %"] >= 95]
+    # Filter by model size
+    if size_filter and size_filter != "all":
+        df = df[df["Model Size"] == size_filter]
+    # Filter by accessibility
+    if access_filter and access_filter != "all":
+        df = df[df["Accessibility"] == access_filter]
+    # Filter by model name
     filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
+    if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name.lower():
+        filter_list = [name.lower() for name in filter_models_by_name.split(";") if name]
+        df = df[df["LLM_lower_case"].str.contains("|".join(filter_list), na=False)]
+    if len(df) == 0:
+        # Show "no results" message in the plot
+        fig, ax = matplotlib.pyplot.subplots(figsize=(10, 5))
+        ax.text(0.5, 0.5, "No models found matching your filter",
+                ha='center', va='center', fontsize=14, color='gray')
+        ax.set_xlim(0, 1)
+        ax.set_ylim(0, 1)
+        ax.axis('off')
+        return fig, pd.DataFrame(columns=["LLM", "Hallucination %", "Answer %", "Avg Summary Words"])
     fig = visualize_leaderboard(df)
+    return fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]
+with gr.Blocks(
+    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
+    theme=gr.themes.Soft(),
+    css="""
+    .header-logo {
+        display: flex;
+        align-items: center;
+        gap: 10px;
+        margin-bottom: 10px;
+    }
+    .header-logo img {
+        height: 40px;
+    }
+    footer { display: none !important; }
+    """
+) as demo:
+    gr.HTML(
+        '<div class="header-logo">'
+        '<img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara">'
+        '</div>'
+    )
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        with gr.Column(scale=3):
+            plot_output = gr.Plot(show_label=False)
+        with gr.Column(scale=1):
+            filter_input = gr.Textbox(
+                placeholder="Filter models...",
+                show_label=False,
+                value=""
+            )
+            high_ar_toggle = gr.Checkbox(
+                label="Only models with ≥95% answer rate",
+                value=False
+            )
+            size_filter = gr.Radio(
+                choices=["all", "small", "large"],
+                value="all",
+                label="Model size"
+            )
+            access_filter = gr.Radio(
+                choices=["all", "commercial", "open"],
+                value="all",
+                label="Model type"
+            )
+    with gr.Row():
+        table_output = gr.Dataframe(
+            label="Leaderboard",
+            interactive=False,
+            max_height=500
+        )
+    inputs = [filter_input, high_ar_toggle, size_filter, access_filter]
+    outputs = [plot_output, table_output]
+    # Load initial data on page load
+    demo.load(fn=leaderboard, inputs=inputs, outputs=outputs)
+    # Update on filter change or toggle change
+    filter_input.change(fn=leaderboard, inputs=inputs, outputs=outputs)
+    high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
+    size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
+    access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

app/app_utils.py CHANGED Viewed

@@ -1,22 +1,23 @@
 # %%
-import os
 import json
-from huggingface_hub import Repository
 import pandas as pd
 import matplotlib.pyplot as plt
 import matplotlib.figure
 from datetime import datetime
 from sklearn.preprocessing import MinMaxScaler
-# import dotenv
-# dotenv.load_dotenv()
 min_max_scaler = MinMaxScaler()
 # %%
 def pull_results(results_dir: str):
-    repo = Repository(local_dir = results_dir, clone_from="vectara/results", repo_type="dataset")
-    repo.git_pull()
 def extract_info_from_result_file(result_file):
     """
@@ -43,12 +44,19 @@ def extract_info_from_result_file(result_file):
     """
     info = json.load(open(result_file, 'r'))
     result = {
-        "LLM": info["config"]["model_name"],
         "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
-        # "Factual Consistency Rate": info["results"]["factual_consistency_rate"]["factual_consistency_rate"],
         "Answer %": info["results"]["answer_rate"]["answer_rate"],
         "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
     }
     return result
@@ -63,8 +71,8 @@ def get_latest_result_file(dir: str):
     if len(files) == 0:
         return None
     files.sort(key=lambda x: os.path.getmtime(os.path.join(dir, x)))
-    # print ("Scanning: ", dir, "found latest file: ", files[0])
-    return os.path.join(dir, files[0])
 def scan_and_extract(dir: str):
     """Scan all folders recursively and exhaustively to load all JSON files and call `extract_info_from_result_file` on each one.
@@ -80,43 +88,26 @@ def scan_and_extract(dir: str):
                 results.append(extract_info_from_result_file(result_file))
     return results
-def load_results(
-        results_dir: str = "./results",
-        results_json: str = "./results.json"
-        ):
-    try:
-        pull_results(results_dir)
-        print (f"Successfully pulled results from {results_dir}")
-    except Exception as e:
-        print(f"Failed to pull and/or extract latest results: {e}")
-    try:
-        results = scan_and_extract(results_dir)
-        if len(results) > 0:
-            with open(results_json, "w") as f:
-                json.dump(results, f, indent=2)
-            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
-        else:
-            print(f"No results found in {results_dir}")
-    except Exception as e:
-        print(f"Failed to scan and extract results from {results_dir}: {e}")
-        print(f"Using pre-dumped results from {results_json}")
-    results = json.load(open(results_json, "r"))
-    # print(results)
     results_df = pd.DataFrame(results)
     results_df = results_df.sort_values(by="Hallucination %", ascending=True)
-    # replace any value TBD with -1
     results_df = results_df.replace("TBD", 100)
     for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
         results_df[column] = results_df[column].apply(lambda x: round(x, 3))
     results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
     return results_df
 # %%
@@ -150,53 +141,51 @@ def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: flo
         return hallucination_percent, 'black'
 def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
-    fig = plt.figure(figsize=(8, 4))
-    # plot using LLM as x-axis and Hallucination % as y-axis
-    # make bars horizontal
-    plot_df = df.head(10)
     plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
-    plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
-    # plot_df["LLM_x_position"], plot_df["font_color"] = zip(*plot_df.apply(
-    #     lambda row: determine_llm_x_position_and_font_color(row["LLM"], row["Hallucination %"]),
-    #     axis=1
-    # ))
-    for i, row in plot_df.iterrows():
-        plt.text(
-            # row["LLM_x_position"],
-            row["Hallucination %"] + 0.025,
-            row["LLM"],
-            row["Hallucination %"],
-            # f"{row['LLM']}",
-            ha='left',
-            va='center',
-            fontsize=9,
-            # color=row["font_color"]
-        )
-    # plt.yticks([])
-    plt.tight_layout()
-    # add margin to the right of the plot
-    plt.subplots_adjust(right=0.95)
-    plt.xticks(fontsize=9)
-    plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
     plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
     plt.gca().spines['top'].set_visible(False)
     plt.gca().spines['right'].set_visible(False)
-    plt.gca().spines['left'].set_visible(False)
-    plt.gca().invert_yaxis()  # Invert the y-axis to display bars top-down
     return fig
 # %%
 if __name__ == "__main__":
-    results = scan_and_extract("./results")
-    with open("./results.json", "w") as f:
-        json.dump(results, f, indent=2)
 # %%

 # %%
+import os
 import json
+from huggingface_hub import snapshot_download
 import pandas as pd
 import matplotlib.pyplot as plt
 import matplotlib.figure
 from datetime import datetime
 from sklearn.preprocessing import MinMaxScaler
+import matplotlib.patheffects as pe
 min_max_scaler = MinMaxScaler()
 # %%
 def pull_results(results_dir: str):
+    snapshot_download(
+        repo_id="vectara/results",
+        repo_type="dataset",
+        local_dir=results_dir
+    )
 def extract_info_from_result_file(result_file):
     """
     """
     info = json.load(open(result_file, 'r'))
+    # Extract model_annotations with defaults for missing data
+    annotations = info.get("model_annotations", {})
+    model_size = annotations.get("model_size", "unknown")
+    accessibility = annotations.get("accessibility", "unknown")
     result = {
+        "LLM": info["config"]["model_name"].rstrip("-"),
         "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
         "Answer %": info["results"]["answer_rate"]["answer_rate"],
         "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
+        "Model Size": model_size,
+        "Accessibility": accessibility,
     }
     return result
     if len(files) == 0:
         return None
     files.sort(key=lambda x: os.path.getmtime(os.path.join(dir, x)))
+    # Return the last file (most recent by mtime)
+    return os.path.join(dir, files[-1])
 def scan_and_extract(dir: str):
     """Scan all folders recursively and exhaustively to load all JSON files and call `extract_info_from_result_file` on each one.
                 results.append(extract_info_from_result_file(result_file))
     return results
+def load_results(results_dir: str = "/tmp/hhem_results"):
+    """Download results from the HuggingFace dataset into results_dir, then parse them into a DataFrame."""
+    pull_results(results_dir)
+    print(f"Successfully pulled results from HuggingFace to {results_dir}")
+    results = scan_and_extract(results_dir)
+    if not results:
+        raise ValueError(f"No results found in {results_dir}")
+    print(f"Successfully extracted {len(results)} results")
     results_df = pd.DataFrame(results)
     results_df = results_df.sort_values(by="Hallucination %", ascending=True)
     results_df = results_df.replace("TBD", 100)
     for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
         results_df[column] = results_df[column].apply(lambda x: round(x, 3))
     results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
     return results_df
 # %%
         return hallucination_percent, 'black'
 def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
+    fig = plt.figure(figsize=(10, 5))
+    plot_df = df.head(10).copy()
     plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
+    # Reverse order so lowest hallucination is at top
+    plot_df = plot_df.iloc[::-1]
+    y_positions = range(len(plot_df))
+    plt.barh(y_positions, plot_df["Hallucination %"], color=plt.cm.RdYlGn_r(plot_df["normalized_hallucination_rate"]))
+    # Add value labels to the right of bars and answer rate dots at bar end
+    for i, row in enumerate(plot_df.itertuples()):
+        plt.text(row._2 + 0.2, i, f"{row._2}%", ha='left', va='center', fontsize=8, fontweight='bold')
+        # Answer rate indicator - colored dot at end of bar
+        ar_dot_color = '#22aa22' if row._3 >= 95 else '#cc3333'
+        plt.scatter(row._2, i, color=ar_dot_color, s=25, zorder=5)
+    # Strip org prefix (e.g., "google/gemini-2.5" -> "gemini-2.5")
+    labels = [name.split("/")[-1] for name in plot_df["LLM"]]
+    plt.yticks(y_positions, labels, fontsize=8)
+    plt.xlabel("Hallucination Rate", fontsize=10)
     plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
     plt.gca().spines['top'].set_visible(False)
     plt.gca().spines['right'].set_visible(False)
+    # Add legend for answer rate dots
+    plt.scatter([], [], color='#22aa22', s=25, label='≥95%')
+    plt.scatter([], [], color='#cc3333', s=25, label='<95%')
+    plt.legend(loc='upper right', fontsize=8, framealpha=0.9, title='Answer Rate', title_fontsize=8)
+    plt.tight_layout()
+    plt.subplots_adjust(left=0.25, bottom=0.15)
+    # Add copyright at bottom
+    plt.figtext(0.5, 0.02, f"Copyright (2025) Vectara, Inc. - Plot generated on {datetime.now().strftime('%B %d, %Y')}",
+                ha='center', fontsize=10)
     return fig
 # %%
 if __name__ == "__main__":
+    df = load_results()
+    print(df)
 # %%

app/requirements.txt CHANGED Viewed

@@ -1,7 +1,5 @@
-funix==0.6.2
 pandas
-huggingface_hub==0.36.0
 matplotlib
-scikit-learn
-ipython
-git-lfs

+gradio>=4.0.0
 pandas
+huggingface_hub>=0.20.0
 matplotlib
+scikit-learn

app/vectara_theme.py DELETED Viewed

@@ -1,29 +0,0 @@
-vectara_theme = {
-    "name": "vectara",
-    "funix": {
-        "run_button": "Refresh",
-        "grid_height": 960,
-        "grid_checkbox": False
-    },
-    "overrides": {
-        "MuiAppBar": {
-            "styleOverrides": {
-                "root": {
-                    "backgroundColor": "#ffffff",
-                    "color": "rgba(0, 0, 0, 0.87)",
-                    "& .MuiToolbar-root:before": {
-                        "content": '""',
-                        "background": "url('https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png')",
-                        "display": "block",
-                        "background-size": "contain",
-                        "background-repeat": "no-repeat",
-                        "background-position": "left",
-                        "width": "125px",
-                        "height": "40px",
-                        "margin-right": "10px",
-                    },
-                },
-            }
-        },
-    },
-}

src/display/utils.py CHANGED Viewed

@@ -20,7 +20,7 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
     dummy: bool = False
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init

     hidden: bool = False
     never_hidden: bool = False
     dummy: bool = False
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init