Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| from io import StringIO | |
# Load data from Google Sheets
def load_data_from_google_sheets():
    """Fetch the leaderboard spreadsheet as CSV and return it for Gradio.

    Returns:
        tuple[list[list], list[str]]: (rows, column headers). On any
        fetch/parse failure the error is printed and ``([], [])`` is
        returned so module-level unpacking and the UI still work.
    """
    SHEET_ID = "19HEHUtljTu1jaScgOvRup8mHvh_2gA3ctMtqdNjOutI"
    # CSV export URL format for Google Sheets (no auth needed for a
    # link-shared sheet).
    url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv"
    try:
        # Timeout added: a hung request would otherwise block app startup
        # indefinitely, since this is called at import time.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Read CSV data into pandas DataFrame
        df = pd.read_csv(StringIO(response.text))
        # Convert DataFrame to list of lists / list of column names,
        # the shapes gr.Dataframe expects for `value` and `headers`.
        return df.values.tolist(), df.columns.tolist()
    except Exception as e:
        print(f"Error loading Google Sheets data: {e}")
        # Bug fix: the original fell through and implicitly returned None,
        # which made `data, headers = load_data_from_google_sheets()`
        # raise TypeError and crash the app on any failure. Return empty
        # data instead so the leaderboard renders (empty) and the refresh
        # button can retry later.
        return [], []
# Load the data from Google Sheets
# Fetched once at import time so the initial Dataframe render has content;
# the refresh button below re-fetches on demand.
leaderboard_data, headers = load_data_from_google_sheets()
# Create the Gradio interface
with gr.Blocks(title="LLM Propensity Evaluation Leaderboard") as demo:
    gr.Markdown("# π‘οΈ LLM Propensity Evaluation Leaderboard")
    gr.Markdown("Measuring propensities / alignment traits of the most downloaded models on HuggingFace")
    # Add methodology or description (collapsed by default)
    with gr.Accordion("π Evaluation Methodology", open=False):
        gr.Markdown("""
        This board tracks the performance of most popular language models on key alignment traits. The evaluations are based on standardized datasets and metrics to ensure consistency and reliability.
        ## Who is this board for?
        * Researchers and developers interested in understanding the alignment characteristics of various language models they integrate into their applications.
        * Organizations looking to select models based on their propensity.
        * AI ethics and safety teams aiming to monitor and evaluate the behavior of language models in their systems.
        * AI regulators and policymakers interested in the alignment and safety aspects of widely used (popular) language models.
        ## Evaluation Details:
        - **Instruction Following Score**: Measures a model's tendency to follow instructions accurately. Measured using the **[IFEval](https://arxiv.org/pdf/2311.07911)** dataset.
        - **Uncommon Facts Hallucination Rate**: Evaluates how often a model hallucinates when questioned on facts. Measured using a subset of the **[SimpleQA](https://arxiv.org/abs/2411.04368)** dataset, which explicitly asks uncommon facts. We calculated the rate using this formula : (1 - (correct + not_attempted)), where correct = when the model answered a question correctly and not_attempted = when a model admits to not knowing the answer to a question.*
        - All evals have been run using the **[Inspect](https://github.com/UKGovernmentBEIS/inspect_evals)** framework from UK AISI.
        ## How to Interpret the Scores:
        * Instruction Following Score: Higher scores indicate better adherence to instructions.
        * Hallucination Rate: Lower rates indicate fewer hallucinations.
        *Note*: The evaluation metrics are designed to provide insights into the models' behavior in specific contexts. They may not capture all aspects of model performance or alignment.
        """)

    # Add refresh functionality: re-fetch the sheet and return a fresh
    # Dataframe component so both rows and headers are replaced.
    # NOTE(review): if the fetch fails this receives the loader's error
    # fallback — confirm the loader returns a (data, headers) tuple on
    # failure, otherwise this unpacking raises.
    def refresh_data():
        data, cols = load_data_from_google_sheets()
        return gr.Dataframe(value=data, headers=cols)

    refresh_btn = gr.Button("π Refresh Data")

    # Create the leaderboard table, populated from the import-time fetch.
    leaderboard = gr.Dataframe(
        value=leaderboard_data,
        headers=headers,
        # Assumes the sheet has exactly 3 columns: model name + two
        # numeric scores — TODO confirm against the spreadsheet schema.
        datatype=["str", "number", "number"],
        interactive=False,  # read-only: users can't edit leaderboard cells
        wrap=True,
    )

    # Connect refresh button: clicking replaces the leaderboard component
    refresh_btn.click(refresh_data, outputs=leaderboard)

    # Add footer information
    gr.Markdown("""
    ---
    **Last Updated**: November 1, 2025
    """)
# Launch the app only when this file is executed directly (not when the
# module is merely imported and `demo` is picked up by a host runtime).
if __name__ == "__main__":
    demo.launch()