Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

App Files Files Community

ofermend committed about 6 hours ago

Commit

0e2da72

1 Parent(s): 7cd85bf

updated

Browse files

Files changed (2) hide show

app/app.py +22 -5
app/app_utils.py +75 -0

app/app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import pandas as pd
 import plotly.graph_objects as go
-from app_utils import load_results, visualize_leaderboard
 results_df = load_results()
@@ -20,11 +20,15 @@ def leaderboard(
     filter_models_by_name: str = "",
     high_ar_only: bool = False,
     size_filter: str = "all",
-    access_filter: str = "all"
 ):
     """Filter and display the leaderboard."""
     df = results_df.copy()
     # Filter by answer rate if toggle is on
     if high_ar_only:
         df = df[df["Answer %"] >= 95]
@@ -76,6 +80,11 @@ with gr.Blocks(
     }
     footer { display: none !important; }
     .modebar { display: none !important; }
     """
 ) as demo:
     gr.HTML(
@@ -101,12 +110,19 @@ with gr.Blocks(
             size_filter = gr.Radio(
                 choices=["all", "small", "large"],
                 value="all",
-                label="Model size"
             )
             access_filter = gr.Radio(
                 choices=["all", "commercial", "open"],
                 value="all",
-                label="Model type"
             )
     with gr.Row():
@@ -116,7 +132,7 @@ with gr.Blocks(
             max_height=500
         )
-    inputs = [filter_input, high_ar_toggle, size_filter, access_filter]
     outputs = [plot_output, table_output]
     # Load initial data on page load
@@ -127,6 +143,7 @@ with gr.Blocks(
     high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
     size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
     access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
 if __name__ == "__main__":

 import pandas as pd
 import plotly.graph_objects as go
+from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP
 results_df = load_results()
     filter_models_by_name: str = "",
     high_ar_only: bool = False,
     size_filter: str = "all",
+    access_filter: str = "all",
+    data_slice: str = "Overall"
 ):
     """Filter and display the leaderboard."""
     df = results_df.copy()
+    # Apply data slice first (recalculates metrics and re-sorts)
+    df = apply_data_slice(df, data_slice)
     # Filter by answer rate if toggle is on
     if high_ar_only:
         df = df[df["Answer %"] >= 95]
     }
     footer { display: none !important; }
     .modebar { display: none !important; }
+    .horizontal-radio .wrap {
+        display: flex !important;
+        flex-direction: row !important;
+        gap: 8px !important;
+    }
     """
 ) as demo:
     gr.HTML(
             size_filter = gr.Radio(
                 choices=["all", "small", "large"],
                 value="all",
+                label="Model size",
+                elem_classes=["horizontal-radio"]
             )
             access_filter = gr.Radio(
                 choices=["all", "commercial", "open"],
                 value="all",
+                label="Model type",
+                elem_classes=["horizontal-radio"]
+            )
+            data_slice = gr.Dropdown(
+                choices=list(DATA_SLICE_MAP.keys()),
+                value="Overall",
+                label="Data Slice"
             )
     with gr.Row():
             max_height=500
         )
+    inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
     outputs = [plot_output, table_output]
     # Load initial data on page load
     high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
     size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
     access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
+    data_slice.change(fn=leaderboard, inputs=inputs, outputs=outputs)
 if __name__ == "__main__":

app/app_utils.py CHANGED Viewed

@@ -57,6 +57,8 @@ def extract_info_from_result_file(result_file):
         "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
         "Model Size": model_size,
         "Accessibility": accessibility,
     }
     return result
@@ -110,6 +112,79 @@ def load_results(results_dir: str = "/tmp/hhem_results"):
     return results_df
 # %%
 def determine_font_size(LLM: str, hallucination_percent: float) -> int:
     # based on both hallucination percent and LLM name, determine font size

         "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
         "Model Size": model_size,
         "Accessibility": accessibility,
+        "category_results": info.get("category_results", {}),
+        "text_complexity_results": info.get("text_complexity_results", {}),
     }
     return result
     return results_df
+# Mapping from dropdown display values to internal keys.
+# Each value is a (slice_type, slice_key) tuple:
+#   - slice_type "overall": no slice is applied (slice_key is None).
+#   - slice_type "complexity": slice_key is looked up in a row's
+#     "text_complexity_results" dict.
+#   - slice_type "category": slice_key is looked up in a row's
+#     "category_results" dict.
+# The keys of this dict, in this order, are the choices offered by the
+# "Data Slice" dropdown in app.py.
+DATA_SLICE_MAP = {
+    "Overall": ("overall", None),
+    "Low Complexity": ("complexity", "low_complexity_text"),
+    "High Complexity": ("complexity", "high_complexity_text"),
+    "Business": ("category", "business"),
+    "Education": ("category", "education"),
+    "Finance": ("category", "finance"),
+    "Law": ("category", "law"),
+    "Medicine": ("category", "medicine"),
+    "Politics": ("category", "politics"),
+    "Science": ("category", "science"),
+    "Sports": ("category", "sports"),
+    "Stocks": ("category", "stocks"),
+    "Technology": ("category", "technology"),
+}
+def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
+    """Recalculate the leaderboard metrics for a single data slice.
+    Args:
+        df: DataFrame with "category_results" and "text_complexity_results"
+            columns whose cells are dicts keyed by slice key.
+        slice_name: Display name of the slice, i.e. a key of DATA_SLICE_MAP
+            (e.g., "Overall", "Low Complexity", "Business").
+    Returns:
+        ``df`` itself when ``slice_name`` is unknown or selects the overall
+        slice. Otherwise a new DataFrame holding only the rows that have
+        usable data for the slice, with "Hallucination %", "Answer %" and
+        "Avg Summary Words" replaced by the slice values (rounded to 3
+        decimals) and sorted by "Hallucination %" ascending.
+    """
+    if slice_name not in DATA_SLICE_MAP:
+        return df
+    slice_type, slice_key = DATA_SLICE_MAP[slice_name]
+    if slice_type == "overall":
+        return df
+    source_column = (
+        "text_complexity_results" if slice_type == "complexity" else "category_results"
+    )
+    # Without the source column no row can have data for this slice.
+    if source_column not in df.columns:
+        return df.iloc[0:0].copy()
+    # (leaderboard column, key inside the slice dict)
+    metric_keys = (
+        ("Hallucination %", "hallucination_rate"),
+        ("Answer %", "answer_rate"),
+        ("Avg Summary Words", "average_summary_length"),
+    )
+    # Rows are tracked by position, not by index label, so duplicate index
+    # labels cannot duplicate or mis-assign rows.
+    kept_positions = []
+    metric_values = {column: [] for column, _ in metric_keys}
+    for position, cell in enumerate(df[source_column]):
+        # Cells may be NaN/None when a result file had no breakdown.
+        slice_data = cell.get(slice_key) if isinstance(cell, dict) else None
+        if not isinstance(slice_data, dict):
+            continue
+        # A slice without a hallucination rate is treated as "no data":
+        # defaulting it to 0 would rank the model first on the leaderboard.
+        if slice_data.get("hallucination_rate") is None:
+            continue
+        kept_positions.append(position)
+        for column, key in metric_keys:
+            # "or 0" also covers a secondary metric stored as None.
+            metric_values[column].append(round(slice_data.get(key) or 0, 3))
+    # Filter to only rows with data for this slice
+    result_df = df.iloc[kept_positions].copy()
+    if kept_positions:
+        for column, values in metric_values.items():
+            result_df[column] = values
+    # Re-sort by hallucination rate
+    return result_df.sort_values(by="Hallucination %", ascending=True)
 # %%
 def determine_font_size(LLM: str, hallucination_percent: float) -> int:
     # based on both hallucination percent and LLM name, determine font size