import gradio as gr
import pandas as pd
import requests
from io import StringIO
# Load data from Google Sheets
def load_data_from_google_sheets():
    SHEET_ID = "19HEHUtljTu1jaScgOvRup8mHvh_2gA3ctMtqdNjOutI"
    # CSV export URL format for Google Sheets
    url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Read CSV data into a pandas DataFrame
        df = pd.read_csv(StringIO(response.text))
        # Convert DataFrame to list of lists for Gradio
        data = df.values.tolist()
        headers = df.columns.tolist()
        return data, headers
    except Exception as e:
        print(f"Error loading Google Sheets data: {e}")
        # Fall back to an empty table so the app can still launch
        return [], []
# Load the data from Google Sheets
leaderboard_data, headers = load_data_from_google_sheets()
# Create the Gradio interface
with gr.Blocks(title="LLM Propensity Evaluation Leaderboard") as demo:
    gr.Markdown("# 🛡️ LLM Propensity Evaluation Leaderboard")
    gr.Markdown("Measuring the propensities and alignment traits of the most downloaded models on Hugging Face")

    # Add methodology and description
    with gr.Accordion("📊 Evaluation Methodology", open=False):
        gr.Markdown("""
This board tracks the performance of the most popular language models on key alignment traits. The evaluations are based on standardized datasets and metrics to ensure consistency and reliability.
## Who is this board for?

* Researchers and developers who want to understand the alignment characteristics of the language models they integrate into their applications.
* Organizations selecting models based on their propensities.
* AI ethics and safety teams monitoring and evaluating the behavior of language models in their systems.
* AI regulators and policymakers interested in the alignment and safety of widely used language models.
## Evaluation Details

- **Instruction Following Score**: Measures a model's tendency to follow instructions accurately. Measured with the **[IFEval](https://arxiv.org/pdf/2311.07911)** dataset.
- **Uncommon Facts Hallucination Rate**: Evaluates how often a model hallucinates when asked about obscure facts. Measured with a subset of the **[SimpleQA](https://arxiv.org/abs/2411.04368)** dataset that explicitly asks about uncommon facts. The rate is calculated as 1 - (correct + not_attempted), where *correct* is the fraction of questions the model answered correctly and *not_attempted* is the fraction where the model admitted it did not know the answer.
- All evals were run with the **[Inspect](https://github.com/UKGovernmentBEIS/inspect_evals)** framework from the UK AISI.
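A short worked example of the rate formula above (the numbers are purely illustrative, not real scores):

```python
# Hypothetical scores, for illustration only
correct = 0.60        # fraction of questions answered correctly
not_attempted = 0.25  # fraction where the model admitted not knowing
hallucination_rate = 1 - (correct + not_attempted)
print(round(hallucination_rate, 2))  # 0.15
```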
## How to Interpret the Scores

* **Instruction Following Score**: Higher scores indicate better adherence to instructions.
* **Hallucination Rate**: Lower rates indicate fewer hallucinations.

*Note*: These metrics are designed to give insight into model behavior in specific contexts; they may not capture all aspects of model performance or alignment.
""")
    # Add refresh functionality
    def refresh_data():
        data, cols = load_data_from_google_sheets()
        return gr.Dataframe(value=data, headers=cols)

    refresh_btn = gr.Button("🔄 Refresh Data")

    # Create the leaderboard
    leaderboard = gr.Dataframe(
        value=leaderboard_data,
        headers=headers,
        datatype=["str", "number", "number"],
        interactive=False,
        wrap=True,
    )

    # Connect the refresh button
    refresh_btn.click(refresh_data, outputs=leaderboard)
    # Add footer information
    gr.Markdown("""
---
**Last Updated**: November 1, 2025
""")
# Launch the app
if __name__ == "__main__":
    demo.launch()