File size: 6,095 Bytes
759ec0e
9fcf507
4a1f5da
9fcf507
 
 
 
 
4a1f5da
 
 
9fcf507
4a1f5da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fcf507
 
 
fd3f7e0
9fcf507
 
 
 
4a1f5da
9fcf507
 
 
 
 
 
4a1f5da
9fcf507
 
 
 
4a1f5da
9fcf507
 
 
 
 
 
fd3f7e0
 
9fcf507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a1f5da
9fcf507
 
 
4a1f5da
 
 
 
9fcf507
 
 
 
 
 
 
 
 
 
 
 
 
 
4a1f5da
 
9fcf507
4a1f5da
9fcf507
 
 
 
 
 
 
4a1f5da
9fcf507
 
 
 
4a1f5da
759ec0e
9fcf507
4a1f5da
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import streamlit as st
import pandas as pd
from datasets import load_dataset # Import the Hugging Face datasets library

# Page configuration: must be the first Streamlit call in the script;
# "wide" layout gives the leaderboard table the full browser width.
st.set_page_config(layout="wide", page_title="TranslateBench EN-ES Leaderboard")

# Caching the data loading function
@st.cache_data # Use st.cache_data for dataframes and serializable objects
def load_data_from_hf():
    """Load and preprocess the benchmark data from Hugging Face.

    Downloads ``model_benchmark_summary.csv`` from the
    ``Thermostatic/TranslateBench-EN-ES`` dataset, converts it to a pandas
    DataFrame, derives a ``Provider`` column from the model-name prefix, and
    coerces the score columns to numeric.

    Returns:
        pandas.DataFrame on success, or ``None`` if downloading or
        preprocessing fails (the error is surfaced via ``st.error``).
    """
    try:
        st.info("Fetching data from Hugging Face (Thermostatic/TranslateBench-EN-ES)... This may take a moment.")
        # Load the specific CSV file from the dataset repository.
        # 'load_dataset' returns a DatasetDict; for a single CSV the data is
        # typically under the 'train' key.
        dataset_dict = load_dataset("Thermostatic/TranslateBench-EN-ES", data_files="model_benchmark_summary.csv")

        # Access the dataset (it will be the 'train' split by default for a single file)
        if 'train' in dataset_dict:
            dataset = dataset_dict['train']
        else:
            # Fallback in case the default split name isn't 'train':
            # take the first (and likely only) split in the DatasetDict.
            first_split_name = list(dataset_dict.keys())[0]
            dataset = dataset_dict[first_split_name]
            st.warning(f"Using split '{first_split_name}' as 'train' split was not found.")

        df = dataset.to_pandas()
        st.success("Data loaded successfully from Hugging Face!")

        # --- Preprocessing ---
        # Derive provider from the model-name prefix (text before the first
        # underscore). str() guards against non-string cells (e.g. NaN) so a
        # single malformed row cannot crash the whole load.
        df['Provider'] = df['Model Name'].apply(lambda x: str(x).split('_')[0].capitalize())
        # Coerce score columns to numeric; skip any column the CSV no longer
        # provides so a schema change degrades gracefully instead of raising
        # KeyError. errors='coerce' turns unparsable cells into NaN.
        score_cols = ['Weighted Score', 'BLEU', 'METEOR', 'COMET']
        for col in score_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        return df
    except Exception as e:
        # Boundary handler: report the failure in the UI and signal it to the
        # caller with None rather than crashing the app.
        st.error(f"An error occurred while loading or processing data from Hugging Face: {e}")
        return None

# --- Main Application ---
st.title("🏆 TranslateBench EN-ES Leaderboard")
st.markdown("""
This leaderboard shows the performance of various models on the English-to-Spanish translation task.
Data is sourced directly from the [Thermostatic/TranslateBench-EN-ES](https://huggingface.co/datasets/Thermostatic/TranslateBench-EN-ES) dataset on Hugging Face.
You can sort the table by different metrics and filter by model provider.
""")

# Load data (cached by st.cache_data, so reruns after widget interaction are cheap)
data_df = load_data_from_hf()

if data_df is not None:
    # --- Sidebar for Controls ---
    st.sidebar.header("⚙️ Leaderboard Controls")

    # Metric selection for sorting (index=0 defaults to 'Weighted Score')
    sortable_metrics = ['Weighted Score', 'BLEU', 'METEOR', 'COMET']
    sort_by = st.sidebar.selectbox("Sort by Metric:", sortable_metrics, index=0)

    # Sort order: the radio returns the selected label string, so derive the
    # boolean directly from the comparison (no ternary needed).
    sort_order_choice = st.sidebar.radio("Sort Order:", ("Descending (Best First)", "Ascending (Worst First)"), index=0)
    is_ascending = sort_order_choice == "Ascending (Worst First)"

    # Provider filter: all providers selected by default
    all_providers = sorted(data_df['Provider'].unique())
    selected_providers = st.sidebar.multiselect(
        "Filter by Provider:",
        options=all_providers,
        default=all_providers
    )

    if not selected_providers:
        st.warning("Please select at least one provider to display results.")
        # Keep downstream code simple by using an empty frame with the same columns
        filtered_df = pd.DataFrame(columns=data_df.columns)
    else:
        filtered_df = data_df[data_df['Provider'].isin(selected_providers)]

    # Apply sorting and assign 1-based ranks
    if not filtered_df.empty:
        sorted_df = filtered_df.sort_values(by=sort_by, ascending=is_ascending).reset_index(drop=True)
        sorted_df.insert(0, 'Rank', range(1, 1 + len(sorted_df)))
    else:
        # Still empty if no providers selected or no data after filter
        sorted_df = filtered_df


    # --- Display Top Performer ---
    st.header("🥇 Top Performer")
    if not sorted_df.empty:
        top_model = sorted_df.iloc[0]
        st.metric(
            label=f"Best Model by {sort_by}",
            value=top_model['Model Name'],
            delta=f"{top_model[sort_by]:.4f} ({sort_by})",
            delta_color="off" # No up/down arrow needed here
        )
        # One column per metric; guard each lookup so a missing metric column
        # renders as "N/A" instead of raising.
        cols = st.columns(len(sortable_metrics))
        for i, metric in enumerate(sortable_metrics):
            with cols[i]:
                if metric in top_model:
                    st.metric(label=metric, value=f"{top_model[metric]:.4f}")
                else:
                    st.metric(label=metric, value="N/A")
    else:
        st.info("No data to display for top performer based on current filters.")


    # --- Display Leaderboard Table ---
    st.header("📊 Full Leaderboard")

    # Columns to display in the table (ranked identity columns first)
    display_columns = ['Rank', 'Model Name', 'Provider'] + sortable_metrics

    # Format score columns to 4 decimal places
    formatter = {col: "{:.4f}" for col in sortable_metrics}

    if not sorted_df.empty:
        # Select only columns that actually exist, tolerating schema drift
        existing_display_columns = [col for col in display_columns if col in sorted_df.columns]
        st.dataframe(
            sorted_df[existing_display_columns].style.format(formatter),
            use_container_width=True,
            hide_index=True,
        )
    else:
        st.info("No models match the current filter criteria.")

    # --- Show Raw Data (Optional) ---
    if st.checkbox("Show Raw Data (Downloaded, Unsorted, Unfiltered)"):
        st.subheader("Raw Data")
        st.dataframe(data_df)

else:
    st.warning("Data could not be loaded from Hugging Face. Please check the console for errors, your internet connection, and ensure the dataset/file path is correct.")

st.markdown("---")
st.markdown("Created with Streamlit, Pandas, and data from Hugging Face Datasets.")