import gradio as gr
import pandas as pd
import requests
from io import StringIO


# Load data from Google Sheets
def load_data_from_google_sheets():
    SHEET_ID = "19HEHUtljTu1jaScgOvRup8mHvh_2gA3ctMtqdNjOutI"
    # CSV export URL format for Google Sheets
    url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv"
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Read CSV data into a pandas DataFrame
        df = pd.read_csv(StringIO(response.text))
        # Convert the DataFrame to a list of lists for Gradio
        data = df.values.tolist()
        headers = df.columns.tolist()
        return data, headers
    except Exception as e:
        print(f"Error loading Google Sheets data: {e}")
        # Return empty data so the unpacking below does not fail
        return [], []


# Load the data from Google Sheets
leaderboard_data, headers = load_data_from_google_sheets()

# Create the Gradio interface
with gr.Blocks(title="LLM Propensity Evaluation Leaderboard") as demo:
    gr.Markdown("# 🛡️ LLM Propensity Evaluation Leaderboard")
    gr.Markdown("Measuring propensities / alignment traits of the most downloaded models on HuggingFace")

    # Methodology / description
    with gr.Accordion("📋 Evaluation Methodology", open=False):
        gr.Markdown("""
This board tracks the performance of the most popular language models on key alignment traits. The evaluations are based on standardized datasets and metrics to ensure consistency and reliability.

## Who is this board for?
* Researchers and developers interested in understanding the alignment characteristics of the language models they integrate into their applications.
* Organizations looking to select models based on their propensities.
* AI ethics and safety teams aiming to monitor and evaluate the behavior of language models in their systems.
* AI regulators and policymakers interested in the alignment and safety aspects of widely used (popular) language models.

## Evaluation Details
- **Instruction Following Score**: Measures a model's tendency to follow instructions accurately. Measured using the **[IFEval](https://arxiv.org/pdf/2311.07911)** dataset.
- **Uncommon Facts Hallucination Rate**: Evaluates how often a model hallucinates when questioned on uncommon facts. Measured using a subset of the **[SimpleQA](https://arxiv.org/abs/2411.04368)** dataset that explicitly asks about uncommon facts. We calculate the rate as 1 - (correct + not_attempted), where *correct* is the fraction of questions the model answered correctly and *not_attempted* is the fraction where the model admitted to not knowing the answer.
- All evals have been run using the **[Inspect](https://github.com/UKGovernmentBEIS/inspect_evals)** framework from UK AISI.

## How to Interpret the Scores
* Instruction Following Score: Higher scores indicate better adherence to instructions.
* Hallucination Rate: Lower rates indicate fewer hallucinations.

*Note*: The evaluation metrics are designed to provide insights into the models' behavior in specific contexts. They may not capture all aspects of model performance or alignment.
""")

    # Refresh functionality: reload the sheet and rebuild the table
    def refresh_data():
        data, cols = load_data_from_google_sheets()
        return gr.Dataframe(value=data, headers=cols)

    refresh_btn = gr.Button("🔄 Refresh Data")

    # Create the leaderboard table
    leaderboard = gr.Dataframe(
        value=leaderboard_data,
        headers=headers,
        datatype=["str", "number", "number"],
        interactive=False,
        wrap=True,
    )

    # Connect the refresh button
    refresh_btn.click(refresh_data, outputs=leaderboard)

    # Footer information
    gr.Markdown("""
---
**Last Updated**: November 1, 2025
""")

# Launch the app
if __name__ == "__main__":
    demo.launch()
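

# --- Illustrative sketch (not part of the app, never called) ---
# The methodology text above defines the uncommon-facts hallucination rate as
# 1 - (correct + not_attempted). The helper below is a minimal sketch of that
# formula under the assumption that you have raw per-bucket counts from the
# Inspect run; the function name and parameters (correct, not_attempted, total)
# are hypothetical and do not correspond to any API used elsewhere in this file.
def hallucination_rate(correct: int, not_attempted: int, total: int) -> float:
    """Return 1 - (correct + not_attempted) expressed as fractions of all graded questions."""
    if total == 0:
        return 0.0
    return 1.0 - (correct + not_attempted) / total


# Example: 120 correct and 30 not attempted out of 200 questions -> rate of 0.25.
# print(hallucination_rate(120, 30, 200))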