|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
# Page-level configuration: browser tab title and a wide layout so the
# leaderboard table can use the available horizontal space.
st.set_page_config(
    page_title="TranslateBench EN-ES Leaderboard",
    layout="wide",
)
|
|
|
|
|
|
|
|
def _provider_from_model_name(model_name):
    """Derive a provider label from a model name.

    The label is the text before the first underscore, capitalised
    (``"acme_model-1"`` becomes ``"Acme"``). Values that are not strings,
    such as a missing name read from the CSV, map to ``"Unknown"`` so that a
    single bad row cannot abort the whole load.
    """
    if not isinstance(model_name, str):
        return "Unknown"
    return model_name.split('_')[0].capitalize()


@st.cache_data
def _fetch_benchmark_frame():
    """Download the benchmark summary and return it as a prepared DataFrame.

    Adds a ``Provider`` column derived from ``Model Name`` and coerces the
    score columns to numeric (unparseable values become NaN).

    Errors are deliberately NOT caught here: ``st.cache_data`` only stores
    return values, so letting the exception propagate means a failed download
    is retried on the next rerun instead of a cached ``None`` being served.
    """
    st.info("Fetching data from Hugging Face (Thermostatic/TranslateBench-EN-ES)... This may take a moment.")

    dataset_dict = load_dataset("Thermostatic/TranslateBench-EN-ES", data_files="model_benchmark_summary.csv")

    if 'train' in dataset_dict:
        dataset = dataset_dict['train']
    else:
        # Fall back to whichever split comes first when 'train' is absent.
        first_split_name = next(iter(dataset_dict))
        dataset = dataset_dict[first_split_name]
        st.warning(f"Using split '{first_split_name}' as 'train' split was not found.")

    df = dataset.to_pandas()
    st.success("Data loaded successfully from Hugging Face!")

    df['Provider'] = df['Model Name'].apply(_provider_from_model_name)

    score_cols = ['Weighted Score', 'BLEU', 'METEOR', 'COMET']
    for col in score_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    return df


def load_data_from_hf():
    """Load and preprocess the benchmark data from Hugging Face.

    Returns:
        The prepared ``pandas.DataFrame`` on success, or ``None`` when the
        download or preprocessing fails. In the failure case the error is
        reported on the page via ``st.error``.

    Successful results are cached by the private helper; failures are not, so
    a transient network error does not stick for later reruns.
    """
    try:
        return _fetch_benchmark_frame()
    except Exception as e:
        # Top-level boundary of the app: report the failure and let the
        # caller render its "data could not be loaded" branch.
        st.error(f"An error occurred while loading or processing data from Hugging Face: {e}")
        return None


# Keep `load_data_from_hf.clear()` working now that the cache decorator lives
# on the helper rather than on the public function.
load_data_from_hf.clear = _fetch_benchmark_frame.clear
|
|
|
|
|
|
|
|
# Page heading and introductory blurb shown above the leaderboard.
# NOTE(review): the first glyph of the heading looks like a mis-decoded
# emoji - confirm the intended character against the original file.
_PAGE_HEADING = "๐ TranslateBench EN-ES Leaderboard"

_INTRO_MARKDOWN = """
This leaderboard shows the performance of various models on the English-to-Spanish translation task.
Data is sourced directly from the [Thermostatic/TranslateBench-EN-ES](https://huggingface.co/datasets/Thermostatic/TranslateBench-EN-ES) dataset on Hugging Face.
You can sort the table by different metrics and filter by model provider.
"""

st.title(_PAGE_HEADING)
st.markdown(_INTRO_MARKDOWN)
|
|
|
|
|
|
|
|
# Main page body: load the data, read the sidebar controls, then render the
# top performer, the leaderboard table and the optional raw-data view.
data_df = load_data_from_hf()

if data_df is not None:
    # --- Sidebar controls ---------------------------------------------------
    # NOTE(review): the leading glyphs in the section headers below look like
    # mis-decoded emoji - confirm the intended characters.
    st.sidebar.header("โ๏ธ Leaderboard Controls")

    sortable_metrics = ['Weighted Score', 'BLEU', 'METEOR', 'COMET']
    sort_by = st.sidebar.selectbox("Sort by Metric:", sortable_metrics, index=0)

    sort_order_asc = st.sidebar.radio("Sort Order:", ("Descending (Best First)", "Ascending (Worst First)"), index=0)
    is_ascending = sort_order_asc == "Ascending (Worst First)"

    all_providers = sorted(data_df['Provider'].unique())
    selected_providers = st.sidebar.multiselect(
        "Filter by Provider:",
        options=all_providers,
        default=all_providers
    )

    # --- Filtering ----------------------------------------------------------
    if not selected_providers:
        st.warning("Please select at least one provider to display results.")
        filtered_df = pd.DataFrame(columns=data_df.columns)
    else:
        filtered_df = data_df[data_df['Provider'].isin(selected_providers)]

    # --- Ranking and display order -------------------------------------------
    # Rank always reflects standing on the chosen metric (1 = highest score),
    # independent of the display order. The top performer is taken from the
    # best-first ordering, so choosing "Ascending (Worst First)" no longer
    # labels the worst model as the best one.
    top_model = None
    if filtered_df.empty:
        sorted_df = filtered_df
    else:
        ranked_df = filtered_df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
        ranked_df.insert(0, 'Rank', range(1, 1 + len(ranked_df)))
        top_model = ranked_df.iloc[0]

        if is_ascending:
            # Reverse the standings; the Rank tie-breaker keeps equal scores in
            # exact reverse order. Rows with a missing score stay at the bottom.
            sorted_df = ranked_df.sort_values(
                by=[sort_by, 'Rank'], ascending=[True, False]
            ).reset_index(drop=True)
        else:
            sorted_df = ranked_df

    # --- Top performer --------------------------------------------------------
    st.header("๐ฅ Top Performer")
    if top_model is not None:
        st.metric(
            label=f"Best Model by {sort_by}",
            value=top_model['Model Name'],
            delta=f"{top_model[sort_by]:.4f} ({sort_by})",
            delta_color="off"
        )

        cols = st.columns(len(sortable_metrics))
        for column, metric in zip(cols, sortable_metrics):
            with column:
                # Missing columns and missing values are both shown as "N/A".
                if metric in top_model and pd.notna(top_model[metric]):
                    st.metric(label=metric, value=f"{top_model[metric]:.4f}")
                else:
                    st.metric(label=metric, value="N/A")
    else:
        st.info("No data to display for top performer based on current filters.")

    # --- Full leaderboard -------------------------------------------------------
    st.header("๐ Full Leaderboard")

    display_columns = ['Rank', 'Model Name', 'Provider'] + sortable_metrics
    formatter = {col: "{:.4f}" for col in sortable_metrics}

    if not sorted_df.empty:
        # Only show the columns that actually exist in the loaded data.
        existing_display_columns = [col for col in display_columns if col in sorted_df.columns]
        st.dataframe(
            sorted_df[existing_display_columns].style.format(formatter, na_rep="N/A"),
            use_container_width=True,
            hide_index=True,
        )
    else:
        st.info("No models match the current filter criteria.")

    # --- Raw data -----------------------------------------------------------------
    if st.checkbox("Show Raw Data (Downloaded, Unsorted, Unfiltered)"):
        st.subheader("Raw Data")
        st.dataframe(data_df)

else:
    st.warning("Data could not be loaded from Hugging Face. Please check the console for errors, your internet connection, and ensure the dataset/file path is correct.")
|
|
|
|
|
# Footer: a horizontal rule followed by the attribution line.
for _footer_fragment in (
    "---",
    "Created with Streamlit, Pandas, and data from Hugging Face Datasets.",
):
    st.markdown(_footer_fragment)