Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
updated
Browse files- app/app.py +22 -5
- app/app_utils.py +75 -0
app/app.py
CHANGED
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
import pandas as pd
|
| 3 |
import plotly.graph_objects as go
|
| 4 |
|
| 5 |
-
from app_utils import load_results, visualize_leaderboard
|
| 6 |
|
| 7 |
results_df = load_results()
|
| 8 |
|
|
@@ -20,11 +20,15 @@ def leaderboard(
|
|
| 20 |
filter_models_by_name: str = "",
|
| 21 |
high_ar_only: bool = False,
|
| 22 |
size_filter: str = "all",
|
| 23 |
-
access_filter: str = "all"
|
|
|
|
| 24 |
):
|
| 25 |
"""Filter and display the leaderboard."""
|
| 26 |
df = results_df.copy()
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
# Filter by answer rate if toggle is on
|
| 29 |
if high_ar_only:
|
| 30 |
df = df[df["Answer %"] >= 95]
|
|
@@ -76,6 +80,11 @@ with gr.Blocks(
|
|
| 76 |
}
|
| 77 |
footer { display: none !important; }
|
| 78 |
.modebar { display: none !important; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
"""
|
| 80 |
) as demo:
|
| 81 |
gr.HTML(
|
|
@@ -101,12 +110,19 @@ with gr.Blocks(
|
|
| 101 |
size_filter = gr.Radio(
|
| 102 |
choices=["all", "small", "large"],
|
| 103 |
value="all",
|
| 104 |
-
label="Model size"
|
|
|
|
| 105 |
)
|
| 106 |
access_filter = gr.Radio(
|
| 107 |
choices=["all", "commercial", "open"],
|
| 108 |
value="all",
|
| 109 |
-
label="Model type"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
)
|
| 111 |
|
| 112 |
with gr.Row():
|
|
@@ -116,7 +132,7 @@ with gr.Blocks(
|
|
| 116 |
max_height=500
|
| 117 |
)
|
| 118 |
|
| 119 |
-
inputs = [filter_input, high_ar_toggle, size_filter, access_filter]
|
| 120 |
outputs = [plot_output, table_output]
|
| 121 |
|
| 122 |
# Load initial data on page load
|
|
@@ -127,6 +143,7 @@ with gr.Blocks(
|
|
| 127 |
high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
|
| 128 |
size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
|
| 129 |
access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
|
|
|
|
| 130 |
|
| 131 |
|
| 132 |
if __name__ == "__main__":
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import plotly.graph_objects as go
|
| 4 |
|
| 5 |
+
from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP
|
| 6 |
|
| 7 |
results_df = load_results()
|
| 8 |
|
|
|
|
| 20 |
filter_models_by_name: str = "",
|
| 21 |
high_ar_only: bool = False,
|
| 22 |
size_filter: str = "all",
|
| 23 |
+
access_filter: str = "all",
|
| 24 |
+
data_slice: str = "Overall"
|
| 25 |
):
|
| 26 |
"""Filter and display the leaderboard."""
|
| 27 |
df = results_df.copy()
|
| 28 |
|
| 29 |
+
# Apply data slice first (recalculates metrics and re-sorts)
|
| 30 |
+
df = apply_data_slice(df, data_slice)
|
| 31 |
+
|
| 32 |
# Filter by answer rate if toggle is on
|
| 33 |
if high_ar_only:
|
| 34 |
df = df[df["Answer %"] >= 95]
|
|
|
|
| 80 |
}
|
| 81 |
footer { display: none !important; }
|
| 82 |
.modebar { display: none !important; }
|
| 83 |
+
.horizontal-radio .wrap {
|
| 84 |
+
display: flex !important;
|
| 85 |
+
flex-direction: row !important;
|
| 86 |
+
gap: 8px !important;
|
| 87 |
+
}
|
| 88 |
"""
|
| 89 |
) as demo:
|
| 90 |
gr.HTML(
|
|
|
|
| 110 |
size_filter = gr.Radio(
|
| 111 |
choices=["all", "small", "large"],
|
| 112 |
value="all",
|
| 113 |
+
label="Model size",
|
| 114 |
+
elem_classes=["horizontal-radio"]
|
| 115 |
)
|
| 116 |
access_filter = gr.Radio(
|
| 117 |
choices=["all", "commercial", "open"],
|
| 118 |
value="all",
|
| 119 |
+
label="Model type",
|
| 120 |
+
elem_classes=["horizontal-radio"]
|
| 121 |
+
)
|
| 122 |
+
data_slice = gr.Dropdown(
|
| 123 |
+
choices=list(DATA_SLICE_MAP.keys()),
|
| 124 |
+
value="Overall",
|
| 125 |
+
label="Data Slice"
|
| 126 |
)
|
| 127 |
|
| 128 |
with gr.Row():
|
|
|
|
| 132 |
max_height=500
|
| 133 |
)
|
| 134 |
|
| 135 |
+
inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
|
| 136 |
outputs = [plot_output, table_output]
|
| 137 |
|
| 138 |
# Load initial data on page load
|
|
|
|
| 143 |
high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
|
| 144 |
size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
|
| 145 |
access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
|
| 146 |
+
data_slice.change(fn=leaderboard, inputs=inputs, outputs=outputs)
|
| 147 |
|
| 148 |
|
| 149 |
if __name__ == "__main__":
|
app/app_utils.py
CHANGED
|
@@ -57,6 +57,8 @@ def extract_info_from_result_file(result_file):
|
|
| 57 |
"Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
|
| 58 |
"Model Size": model_size,
|
| 59 |
"Accessibility": accessibility,
|
|
|
|
|
|
|
| 60 |
}
|
| 61 |
return result
|
| 62 |
|
|
@@ -110,6 +112,79 @@ def load_results(results_dir: str = "/tmp/hhem_results"):
|
|
| 110 |
|
| 111 |
return results_df
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# %%
|
| 114 |
def determine_font_size(LLM: str, hallucination_percent: float) -> int:
|
| 115 |
# based on both hallucination percent and LLM name, determine font size
|
|
|
|
| 57 |
"Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
|
| 58 |
"Model Size": model_size,
|
| 59 |
"Accessibility": accessibility,
|
| 60 |
+
"category_results": info.get("category_results", {}),
|
| 61 |
+
"text_complexity_results": info.get("text_complexity_results", {}),
|
| 62 |
}
|
| 63 |
return result
|
| 64 |
|
|
|
|
| 112 |
|
| 113 |
return results_df
|
| 114 |
|
| 115 |
+
|
| 116 |
+
# Mapping from dropdown display values to internal keys.
# Each value is a (slice_type, slice_key) pair: slice_type selects which
# per-model results column is consulted, slice_key is the entry inside it.
DATA_SLICE_MAP = {
    "Overall": ("overall", None),
    "Low Complexity": ("complexity", "low_complexity_text"),
    "High Complexity": ("complexity", "high_complexity_text"),
    "Business": ("category", "business"),
    "Education": ("category", "education"),
    "Finance": ("category", "finance"),
    "Law": ("category", "law"),
    "Medicine": ("category", "medicine"),
    "Politics": ("category", "politics"),
    "Science": ("category", "science"),
    "Sports": ("category", "sports"),
    "Stocks": ("category", "stocks"),
    "Technology": ("category", "technology"),
}

# Leaderboard column -> key inside a single slice's result dict.
_SLICE_METRIC_KEYS = (
    ("Hallucination %", "hallucination_rate"),
    ("Answer %", "answer_rate"),
    ("Avg Summary Words", "average_summary_length"),
)


def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
    """Apply a data slice filter to recalculate metrics.

    Args:
        df: DataFrame with category_results and text_complexity_results
            columns whose cells are dicts keyed by slice key.
        slice_name: Display name of the slice (e.g., "Overall",
            "Low Complexity", "Business").

    Returns:
        ``df`` itself (unchanged, not copied) when ``slice_name`` is unknown
        or is the "Overall" slice. Otherwise a new DataFrame containing only
        the rows that have a non-empty dict for the requested slice, with
        "Hallucination %", "Answer %" and "Avg Summary Words" replaced by the
        slice's values (rounded to 3 decimals, missing keys counted as 0) and
        sorted by "Hallucination %" ascending. The input frame is never
        mutated.
    """
    if slice_name not in DATA_SLICE_MAP:
        return df

    slice_type, slice_key = DATA_SLICE_MAP[slice_name]
    if slice_type == "overall":
        return df

    if slice_type == "complexity":
        source_column = "text_complexity_results"
    else:  # category
        source_column = "category_results"

    # No per-slice data at all: nothing qualifies for this slice.
    if source_column not in df.columns:
        return df.iloc[0:0].copy()

    # Collect rows by *position* so that a non-unique index cannot make the
    # selection ambiguous or duplicate rows.
    positions = []
    entries = []
    for position, cell in enumerate(df[source_column]):
        # Cells may be NaN/None (or any non-dict) when a result file had no
        # slice section; treat those the same as "no data for this slice".
        if not isinstance(cell, dict):
            continue
        entry = cell.get(slice_key)
        if not isinstance(entry, dict) or not entry:
            continue
        positions.append(position)
        entries.append(entry)

    result_df = df.iloc[positions].copy()
    if not entries:
        # Keep the original column dtypes on an empty result.
        return result_df

    # Overwrite the headline metrics with the slice-specific values.
    for column, key in _SLICE_METRIC_KEYS:
        result_df[column] = [round(entry.get(key, 0), 3) for entry in entries]

    # Re-sort by hallucination rate; a stable sort keeps tied models in
    # their incoming (overall-ranking) order instead of an arbitrary one.
    return result_df.sort_values(
        by="Hallucination %", ascending=True, kind="stable"
    )
|
| 187 |
+
|
| 188 |
# %%
|
| 189 |
def determine_font_size(LLM: str, hallucination_percent: float) -> int:
|
| 190 |
# based on both hallucination percent and LLM name, determine font size
|