Spaces:

PropensityLabs
/

LLM-Propensity-Evals

Sleeping

App Files Files Community

foo-barrr committed on Sep 11

Commit

cf68414

1 Parent(s): 23b13b0

Simple Gradio app to display models and scores from a Google sheet

Browse files

Files changed (2) hide show

app.py +73 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import gradio as gr
+import pandas as pd
+import requests
+from io import StringIO
+# Load data from Google Sheets
+def load_data_from_google_sheets():
+    SHEET_ID = "19HEHUtljTu1jaScgOvRup8mHvh_2gA3ctMtqdNjOutI"
+    # CSV export URL format for Google Sheets
+    url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv"
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        # Read CSV data into pandas DataFrame
+        df = pd.read_csv(StringIO(response.text))
+        # Convert DataFrame to list of lists for Gradio
+        data = df.values.tolist()
+        headers = df.columns.tolist()
+        return data, headers
+    except Exception as e:
+        print(f"Error loading Google Sheets data: {e}")
+# Load the data from Google Sheets
+leaderboard_data, headers = load_data_from_google_sheets()
+# Create the Gradio interface
+with gr.Blocks(title="LLM Propensity Evaluation Leaderboard") as demo:
+    gr.Markdown("# 🛡️ LLM Propensity Evaluation Leaderboard")
+    gr.Markdown("Measuring propensities / alignment traits of the most downloaded models on HuggingFace")
+    # Add methodology or description
+    with gr.Accordion("📋 Evaluation Methodology", open=False):
+        gr.Markdown("""
+        **Evaluation Details:**
+        - **Instruction Following Score**: Measures a model's tendency to follow instructions accurately. Measured using the IFEval dataset.
+        - **Hallucination Rate**: Evaluates how often a model hallucinates. Measured using a subset of the SimpleQA dataset. We calculated the rate using this formula : (1 - (correct + not_attempted)), where correct = when the model answered a question correctly and not_attempted = when a model admits to not knowing the answer to a question.*
+        """)
+    # Add refresh functionality
+    def refresh_data():
+        data, cols = load_data_from_google_sheets()
+        return gr.Dataframe(value=data, headers=cols)
+    refresh_btn = gr.Button("🔄 Refresh Data")
+    # Create the leaderboard
+    leaderboard = gr.Dataframe(
+        value=leaderboard_data,
+        headers=headers,
+        datatype=["str", "number", "number"],
+        interactive=False,
+        wrap=True
+    )
+    # Connect refresh button
+    refresh_btn.click(refresh_data, outputs=leaderboard)
+    # Add footer information
+    gr.Markdown("""
+    ---
+    **Last Updated**: Sep 11, 2025
+    **Contact**: <TBD>
+    """)
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio
+pandas
+requests