Evan Frick
committed on
Commit
·
4001fbf
1
Parent(s):
01f0e4f
app.py
CHANGED
|
@@ -43,13 +43,13 @@ def main():
|
|
| 43 |
# Iterate over each model in the selected benchmark
|
| 44 |
for model, metrics in benchmark_data.items():
|
| 45 |
|
|
|
|
|
|
|
| 46 |
model = path_split(path_splitext(model)[0])[-1]
|
| 47 |
# Flatten the metrics dictionary if there are nested metrics
|
| 48 |
# For example, in "human_preference_v1", there are subcategories like "overall", "hard_prompt", etc.
|
| 49 |
# We'll aggregate these or allow the user to select subcategories as needed
|
| 50 |
if isinstance(metrics, dict):
|
| 51 |
-
# Check if metrics contain nested dictionaries
|
| 52 |
-
nested_keys = list(metrics.keys())
|
| 53 |
# If there are nested keys, we can allow the user to select a subcategory
|
| 54 |
# For simplicity, let's assume we want to display all nested metrics concatenated
|
| 55 |
flattened_metrics = {}
|
|
@@ -63,12 +63,14 @@ def main():
|
|
| 63 |
flattened_metrics[subkey] = submetrics
|
| 64 |
records.append({
|
| 65 |
"Model": model,
|
|
|
|
| 66 |
**flattened_metrics
|
| 67 |
})
|
| 68 |
else:
|
| 69 |
# If metrics are not nested, just add them directly
|
| 70 |
records.append({
|
| 71 |
"Model": model,
|
|
|
|
| 72 |
"Value": metrics
|
| 73 |
})
|
| 74 |
|
|
@@ -79,23 +81,27 @@ def main():
|
|
| 79 |
df = df.loc[:, ~df.apply(contains_list)]
|
| 80 |
|
| 81 |
if "human" not in selected_benchmark:
|
| 82 |
-
df = df[sorted(df.columns, key=
|
| 83 |
|
| 84 |
# Set 'Model' as the index
|
| 85 |
-
df.set_index("Model", inplace=True)
|
| 86 |
|
| 87 |
|
| 88 |
# Create two columns: one for spacing and one for the search bar
|
| 89 |
-
col1, col2, col3 = st.columns([1,
|
| 90 |
with col1:
|
| 91 |
-
|
| 92 |
-
# st.markdown("#### Filter Columns")
|
| 93 |
column_search = st.text_input("", placeholder="Search metrics...", key="search")
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
if column_search:
|
| 97 |
# Filter columns that contain the search term (case-insensitive)
|
| 98 |
-
filtered_columns = [col for col in df.columns if column_search.lower() in col.lower()]
|
| 99 |
if filtered_columns:
|
| 100 |
df_display = df[filtered_columns]
|
| 101 |
else:
|
|
@@ -105,8 +111,19 @@ def main():
|
|
| 105 |
# If no search term, display all columns
|
| 106 |
df_display = df
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
# Display the DataFrame
|
| 109 |
-
st.dataframe(df_display.sort_values(df_display.columns[
|
|
|
|
| 110 |
|
| 111 |
# Optional: Allow user to download the data as CSV
|
| 112 |
csv = df_display.to_csv()
|
|
|
|
| 43 |
# Iterate over each model in the selected benchmark
|
| 44 |
for model, metrics in benchmark_data.items():
|
| 45 |
|
| 46 |
+
model_type = "LLM Judge" if model.endswith(".jsonl") else "Reward Model"
|
| 47 |
+
|
| 48 |
model = path_split(path_splitext(model)[0])[-1]
|
| 49 |
# Flatten the metrics dictionary if there are nested metrics
|
| 50 |
# For example, in "human_preference_v1", there are subcategories like "overall", "hard_prompt", etc.
|
| 51 |
# We'll aggregate these or allow the user to select subcategories as needed
|
| 52 |
if isinstance(metrics, dict):
|
|
|
|
|
|
|
| 53 |
# If there are nested keys, we can allow the user to select a subcategory
|
| 54 |
# For simplicity, let's assume we want to display all nested metrics concatenated
|
| 55 |
flattened_metrics = {}
|
|
|
|
| 63 |
flattened_metrics[subkey] = submetrics
|
| 64 |
records.append({
|
| 65 |
"Model": model,
|
| 66 |
+
"Type": model_type,
|
| 67 |
**flattened_metrics
|
| 68 |
})
|
| 69 |
else:
|
| 70 |
# If metrics are not nested, just add them directly
|
| 71 |
records.append({
|
| 72 |
"Model": model,
|
| 73 |
+
"Type": model_type,
|
| 74 |
"Value": metrics
|
| 75 |
})
|
| 76 |
|
|
|
|
| 81 |
df = df.loc[:, ~df.apply(contains_list)]
|
| 82 |
|
| 83 |
if "human" not in selected_benchmark:
|
| 84 |
+
df = df[sorted(df.columns, key=lambda s: s.lower() if s != "Type" else "A")]
|
| 85 |
|
| 86 |
# Set 'Model' as the index
|
| 87 |
+
df.set_index(["Model"], inplace=True)
|
| 88 |
|
| 89 |
|
| 90 |
# Create three columns (ratios 1:1:2): col1 holds the metric search bar, col2 holds the model filter
|
| 91 |
+
col1, col2, col3 = st.columns([1, 1, 2]) # Adjust the ratios as needed
|
| 92 |
with col1:
|
| 93 |
+
|
|
|
|
| 94 |
column_search = st.text_input("", placeholder="Search metrics...", key="search")
|
| 95 |
+
|
| 96 |
+
with col2:
|
| 97 |
+
|
| 98 |
+
model_search = st.text_input("", placeholder="Filter Models (separate criteria with ,) ...", key="search2")
|
| 99 |
+
|
| 100 |
+
model_search_crit = model_search.replace(", ", "|").replace(",", "|")
|
| 101 |
|
| 102 |
if column_search:
|
| 103 |
# Filter columns that contain the search term (case-insensitive)
|
| 104 |
+
filtered_columns = ["Type"] + [col for col in df.columns if column_search.lower() in col.lower()]
|
| 105 |
if filtered_columns:
|
| 106 |
df_display = df[filtered_columns]
|
| 107 |
else:
|
|
|
|
| 111 |
# If no search term, display all columns
|
| 112 |
df_display = df
|
| 113 |
|
| 114 |
+
if model_search:
|
| 115 |
+
|
| 116 |
+
df_display = df_display[df_display.index.str.contains(model_search_crit, case=False)]
|
| 117 |
+
|
| 118 |
+
if len(df_display) == 0:
|
| 119 |
+
st.warning("No models match your filter.")
|
| 120 |
+
df_display = pd.DataFrame() # Empty DataFrame
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
|
| 124 |
# Display the DataFrame
|
| 125 |
+
st.dataframe(df_display.sort_values(df_display.columns[1], ascending=False).style.background_gradient(cmap='summer_r', axis=0)
|
| 126 |
+
if len(df_display) else df_display, use_container_width=True, height=500)
|
| 127 |
|
| 128 |
# Optional: Allow user to download the data as CSV
|
| 129 |
csv = df_display.to_csv()
|