Spaces:

lmarena-ai
/

preference-proxy-evaluations

Running

App Files Files Community

Evan Frick commited on Oct 11, 2024

Commit

631e505

1 Parent(s): 1c6662a

Add

Browse files

Files changed (1) hide show

app.py +121 -0

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import streamlit as st
+import pandas as pd
+import pickle
+from os.path import split as path_split, splitext as path_splitext
+st.set_page_config(
+    page_title="PPE Metrics Explorer",
+    layout="wide",  # This makes the app use the entire screen width
+    initial_sidebar_state="expanded",
+)
+# Set the title of the app
+st.title("PPE Metrics Explorer")
+@st.cache_data
+def load_data(file_path):
+    """
+    Load pickle data from a file.
+    """
+    with open(file_path, 'r') as file:
+        data = pickle.load(file)
+    return data
+def contains_list(column):
+    return column.apply(lambda x: isinstance(x, list)).any()
+def main():
+    # Load the pickle data
+    data = load_data('results.pkl')
+    # Extract the list of benchmarks
+    benchmarks = list(data.keys())
+    # Dropdown for selecting benchmark
+    selected_benchmark = st.selectbox("Select a Benchmark", benchmarks)
+    # Extract data for the selected benchmark
+    benchmark_data = data[selected_benchmark]
+    # Prepare a list to store records
+    records = []
+    # Iterate over each model in the selected benchmark
+    for model, metrics in benchmark_data.items():
+        model = path_split(path_splitext(model)[0])[-1]
+        # Flatten the metrics dictionary if there are nested metrics
+        # For example, in "human_preference_v1", there are subcategories like "overall", "hard_prompt", etc.
+        # We'll aggregate these or allow the user to select subcategories as needed
+        if isinstance(metrics, dict):
+            # Check if metrics contain nested dictionaries
+            nested_keys = list(metrics.keys())
+            # If there are nested keys, we can allow the user to select a subcategory
+            # For simplicity, let's assume we want to display all nested metrics concatenated
+            flattened_metrics = {}
+            for subkey, submetrics in metrics.items():
+                if isinstance(submetrics, dict):
+                    for metric_name, value in submetrics.items():
+                        # Create a compound key
+                        key = f"{subkey} - {metric_name}"
+                        flattened_metrics[key] = value
+                else:
+                    flattened_metrics[subkey] = submetrics
+            records.append({
+                "Model": model,
+                **flattened_metrics
+            })
+        else:
+            # If metrics are not nested, just add them directly
+            records.append({
+                "Model": model,
+                "Value": metrics
+            })
+    # Create a DataFrame
+    df = pd.DataFrame(records)
+    # Drop columns that contain lists
+    df = df.loc[:, ~df.apply(contains_list)]
+    if "human" not in selected_benchmark:
+        df = df[sorted(df.columns, key=str.lower)]
+    # Set 'Model' as the index
+    df.set_index("Model", inplace=True)
+        # Create two columns: one for spacing and one for the search bar
+    col1, col2, col3 = st.columns([1, 3, 1])  # Adjust the ratios as needed
+    with col1:
+        # **Column Search Functionality**
+        # st.markdown("#### Filter Columns")
+        column_search = st.text_input("", placeholder="Search metrics...", key="search")
+    # column_search = st.text_input("Search for metrics (column names):", "")
+    if column_search:
+        # Filter columns that contain the search term (case-insensitive)
+        filtered_columns = [col for col in df.columns if column_search.lower() in col.lower()]
+        if filtered_columns:
+            df_display = df[filtered_columns]
+        else:
+            st.warning("No columns match your search.")
+            df_display = pd.DataFrame()  # Empty DataFrame
+    else:
+        # If no search term, display all columns
+        df_display = df
+    # Display the DataFrame
+    st.dataframe(df_display.sort_values(df_display.columns[0], ascending=False) if len(df_display) else df_display, use_container_width=True)
+    # Optional: Allow user to download the data as CSV
+    csv = df_display.to_csv()
+    st.download_button(
+        label="Download data as CSV",
+        data=csv,
+        file_name=f"{selected_benchmark}_metrics.csv",
+        mime='text/csv',
+    )
+if __name__ == "__main__":
+    main()