|
|
from src.score_calculation.score import score_predictions |
|
|
import ast |
|
|
from datasets import load_dataset |
|
|
import multiprocessing |
|
|
import numpy as np |
|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
from pathlib import Path |
|
|
import plotly.graph_objects as go |
|
|
import plotly.express as px |
|
|
from io import StringIO |
|
|
import json |
|
|
|
|
|
# Directory that load_data() scans for per-model result files (*.tsv).
RESULTS_DIR = "results/"

# Page-level Streamlit configuration: browser tab title, centered layout,
# and a sidebar that starts collapsed.
st.set_page_config(
    page_title="NaviTrace Leaderboard",
    layout="centered",
    initial_sidebar_state="collapsed"
)
|
|
|
|
|
|
|
|
# Inject the page's custom CSS. The class names defined here
# (header-container, links-container, instruction-item, instruction-number,
# instruction-content) are the ones used by the raw HTML snippets rendered
# further down in this script.
st.markdown("""
<style>
/* Import Font Awesome */
@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');

.header-container {
display: flex;
flex-direction: column;
align-items: center;
}

/* Headings */
h1 {
text-align: center;
font-size: 4.5rem !important;
font-weight: 500;
margin-top: 1rem;
margin-bottom: 1rem;
}

/* Links container */
.links-container {
display: flex;
flex-wrap: wrap;
row-gap: 1rem;
justify-content: center;
text-align: center;
margin-bottom: 3rem;
font-size: 1.1rem;
}

.links-container a {
white-space: nowrap;
margin: 0 1rem;
text-decoration: none;
color: #3b82f6;
font-weight: 600;
transition: color 0.3s;
}

.links-container a:hover {
color: #1e3a8a;
}

/* Instructions styling */
.instruction-item {
display: flex;
gap: 1.5rem;
margin: 2rem 0;
align-items: flex-start;
}

.instruction-number {
flex-shrink: 0;
width: 40px;
height: 40px;
border-radius: 50%;
background: linear-gradient(135deg, #3b82f6 0%, #1e3a8a 100%);
color: white;
display: flex;
align-items: center;
justify-content: center;
font-weight: 700;
font-size: 1.2rem;
}

.instruction-content {
flex-grow: 1;
padding-top: 0.3rem;
}

/* Media Query for mobile devices */
@media (max-width: 600px) {
h1 {
font-size: 3.5rem !important; /* Adjust font size for small screens */
}
}
</style>
""", unsafe_allow_html=True)
|
|
|
|
|
def load_data():
    """Load all result files as one data frame.

    Every ``*.tsv`` file directly inside ``RESULTS_DIR`` is read as a
    tab-separated table. A ``model`` column is added to each table, derived
    from the file name (stem with underscores replaced by spaces), and the
    tables are concatenated with a fresh index.

    Returns:
        The combined ``pandas.DataFrame``, or ``None`` when no result file
        exists or any file fails to load. In both failure cases an error is
        shown in the Streamlit page.
    """
    try:
        all_dfs = []
        for file_path in Path(RESULTS_DIR).glob('*.tsv'):
            df = pd.read_csv(file_path, sep='\t')
            # "my_model.tsv" is displayed as "my model" on the leaderboard.
            df["model"] = file_path.stem.replace('_', ' ')
            all_dfs.append(df)

        # Handle the "no files" case explicitly instead of falling through to
        # an unassigned result variable and a cryptic error message.
        if not all_dfs:
            st.error(f"Error loading data: no '*.tsv' result files found in '{RESULTS_DIR}'")
            return None

        return pd.concat(all_dfs, ignore_index=True)
    except Exception as e:
        # UI boundary: report the problem on the page instead of crashing.
        st.error(f"Error loading data: {str(e)}")
        return None
|
|
|
|
|
def calculate_score(results_df):
    """Calculate score using private test split ground truth.

    The ground truth is the ``test`` split of the Hugging Face dataset named
    by the ``HF_DATASET_ID`` environment variable; the access token is read
    from ``HF_TOKEN``.

    Args:
        results_df: Data frame with the uploaded predictions; it is passed
            unchanged to ``score_predictions``.

    Returns:
        Whatever ``score_predictions(results_df, dataset)`` returns, or
        ``None`` if configuration is missing or loading/scoring fails (an
        error is then shown in the Streamlit page).
    """
    # Imported locally because the module-level import block does not
    # provide ``os``.
    import os

    try:
        dataset_id = os.environ.get("HF_DATASET_ID")
        if not dataset_id:
            st.error("Error calculating score: HF_DATASET_ID environment variable is not set")
            return None

        # Authenticate this request by handing the token directly to
        # load_dataset. The previous code called an undefined ``login``
        # helper, which made every call fail with a NameError.
        dataset = load_dataset(
            dataset_id,
            split="test",
            token=os.environ.get("HF_TOKEN"),
        )

        return score_predictions(results_df, dataset)
    except Exception as e:
        # UI boundary: report the problem on the page instead of crashing.
        st.error(f"Error calculating score: {str(e)}")
        return None
|
|
|
|
|
def validate_tsv_format(uploaded_file):
    """Check that an uploaded TSV contains the columns needed for scoring.

    Args:
        uploaded_file: Path or file-like object readable by
            ``pandas.read_csv`` as tab-separated text.

    Returns:
        ``(True, data_frame)`` when every required column is present,
        otherwise ``(False, message)`` where ``message`` explains the problem.
    """
    required_cols = ["sample_id", "embodiment", "category", "prediction"]

    try:
        table = pd.read_csv(uploaded_file, sep='\t')
        absent = [name for name in required_cols if name not in table.columns]
        if not absent:
            return True, table
        return False, f"Missing required columns. Expected: {required_cols}"
    except Exception as e:
        # Any parsing problem is reported to the caller as a message.
        return False, f"Error reading file: {str(e)}"
|
|
|
|
|
def _finite_score_rows(df):
    """Return a copy of ``df`` without the rows whose score equals ``np.inf``."""
    return df[df["score"] != np.inf].copy()


def _rank_models(df_valid):
    """Return the model names ordered by ascending mean score (best first)."""
    means = df_valid.groupby("model")[["score"]].mean().reset_index()
    return means.sort_values(by="score", ascending=True)["model"].tolist()


def _grouped_mean_scores(df_valid, group_col, model_order):
    """Return mean score per (model, ``group_col``), ordered by model rank then score."""
    df_fig = df_valid.groupby(["model", group_col])[["score"]].mean().reset_index()
    # An ordered categorical makes the sort follow the leaderboard ranking
    # instead of alphabetical model names.
    df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True)
    return df_fig.sort_values(by=["model", "score"], ascending=[True, True])


def _style_bar_figure(fig, max_score, legend_title=None):
    """Apply the shared layout and trace styling to a leaderboard bar figure.

    ``legend_title`` is ``None`` for the single-series total view; for the
    grouped views it is the legend heading and switches on grouped bars,
    the horizontal legend and rotated bar labels.
    """
    grouped = legend_title is not None

    layout = dict(
        xaxis=dict(
            title=dict(
                text="Model",
                standoff=25,
            ),
            tickangle=-45,
        ),
        yaxis=dict(
            title_text="Score (Lower is better)",
            # Head-room above the tallest bar for the "outside" text labels.
            range=[0, max_score * 1.25]
        ),
        title_text="",
        font=dict(size=15),
        bargap=0.1 if grouped else 0.2,
        height=600,
        showlegend=grouped,
        margin=dict(
            l=60,
            r=0,
            b=95,
            t=80,
            pad=0
        ),
    )
    trace_style = dict(
        texttemplate="%{y:.2f}",
        textposition="outside",
    )

    if grouped:
        layout.update(
            barmode="group",
            legend=dict(
                orientation="h",
                x=0.5,
                y=1.1,
                xanchor="center",
                yanchor="top",
                borderwidth=0,
                itemclick="toggle",
                itemdoubleclick="toggleothers",
                title=dict(
                    text=legend_title,
                    side="top center"
                )
            ),
            uniformtext_minsize=10,
            uniformtext_mode="show",
        )
        trace_style["textangle"] = -90

    fig.update_layout(**layout)
    fig.update_coloraxes(showscale=False)
    fig.update_traces(**trace_style)


def create_bar_chart(df, view_type):
    """Create interactive bar chart based on view type.

    Args:
        df: Data frame with at least ``model`` and ``score`` columns, plus
            ``embodiment`` / ``category`` for the corresponding views. The
            ``category`` values are parsed with ``ast.literal_eval`` and
            exploded, i.e. they are expected to be string literals of lists.
        view_type: ``"Total Score"``, ``"Per Embodiment"``; any other value
            selects the per-category view.

    Returns:
        The Plotly figure. Rows whose score equals ``np.inf`` are excluded
        from all averages.
    """
    df_valid = _finite_score_rows(df)

    if view_type == "Total Score":
        # One bar per model, coloured by its mean score.
        df_fig = df_valid.groupby("model")[["score"]].mean().reset_index()
        df_fig = df_fig.sort_values(by="score", ascending=True)
        fig = px.bar(
            df_fig,
            x="model",
            y="score",
            color="score",
            color_continuous_scale=px.colors.diverging.Fall,
            orientation="v",
        )
        legend_title = None
    else:
        # The model ranking always comes from the overall per-sample mean,
        # computed before categories are exploded.
        model_order = _rank_models(df_valid)

        if view_type == "Per Embodiment":
            group_col = "embodiment"
            palette = px.colors.qualitative.Plotly
            legend_title = "<b>Embodiments</b>"
        else:
            group_col = "category"
            palette = px.colors.qualitative.Plotly[::-1]
            legend_title = "<b>Categories</b>"
            # A sample can belong to several categories: parse the list
            # literal and count the sample once per category.
            df_valid["category"] = df_valid["category"].apply(ast.literal_eval)
            df_valid = df_valid.explode("category")

        df_fig = _grouped_mean_scores(df_valid, group_col, model_order)
        fig = px.bar(
            df_fig,
            x="model",
            y="score",
            color=group_col,
            color_discrete_sequence=palette,
            orientation="v",
        )

    _style_bar_figure(fig, df_fig["score"].max(), legend_title)

    return fig
|
|
|
|
|
def create_summary_table(df):
    """Build the per-model summary table shown under the chart.

    Rows whose score equals ``np.inf`` are ignored. The result has one row
    per model with the columns ``model``, ``Total Score`` (mean over all
    remaining rows), one column per embodiment and one column per category
    (each holding the mean score of that subset), sorted by ascending
    ``Total Score``.

    The ``category`` values are parsed with ``ast.literal_eval`` and exploded,
    so a row listing several categories contributes to each of them.
    """
    valid_rows = df[df["score"] != np.inf].copy()

    def _wide_means(frame, column):
        # Mean score per (model, column value), spread into one column per value.
        wide = frame.groupby(["model", column])["score"].mean().unstack(column)
        wide.columns = [f"{label}" for label in wide.columns]
        return wide

    # Overall mean per model, indexed by model name.
    summary = valid_rows.groupby("model")["score"].mean().rename("Total Score").to_frame()

    by_embodiment = _wide_means(valid_rows, "embodiment")

    per_category_rows = valid_rows.assign(
        category=valid_rows["category"].apply(ast.literal_eval)
    ).explode("category")
    by_category = _wide_means(per_category_rows, "category")

    summary = summary.join(by_embodiment).join(by_category)
    summary = summary.sort_values(by="Total Score", ascending=True)

    return summary.reset_index()
|
|
|
|
|
|
|
|
# Page header: title plus the row of project links, rendered as raw HTML.
# NOTE: the "Paper" and "Code" links still point at the placeholder "#TODO".
st.markdown("""
<div class="header-container">
<h1>NaviTrace Leaderboard</h1>
<div class="links-container">
<a href="https://leggedrobotics.github.io/navitrace_webpage/">
🏠 Project
</a>
<a href="#TODO">
📄 Paper
</a>
<a href="#TODO">
💻 Code
</a>
<a href="https://huggingface.co/datasets/leggedrobotics/navitrace">
💾 Dataset
</a>
</div>
</div>
""", unsafe_allow_html=True)
|
|
|
|
|
|
|
|
# Load the published results; load_data() returns None when nothing could be
# loaded.
df = load_data()

# Put the score of the model uploaded in this session (if any) in front of
# the published results. pd.concat silently drops a None entry, so this also
# works when load_data() returned None.
# NOTE(review): the uploaded-model row only carries the keys returned by
# score_predictions; confirm they match the columns the chart and table
# aggregate (e.g. "score", "embodiment", "category").
if 'user_results' in st.session_state:
    user_results = pd.DataFrame([st.session_state.user_results])
    df = pd.concat([user_results, df], ignore_index=True)

# View selector for the chart.
view_type = st.selectbox(
    "Select View",
    ["Total Score", "Per Embodiment", "Per Category"],
)

# Only render the chart and table when there is something to aggregate;
# create_bar_chart / create_summary_table would otherwise fail on a missing
# frame or a missing "score" column.
if df is None or df.empty or "score" not in df.columns:
    st.warning("No leaderboard results are available to display.")
else:
    fig = create_bar_chart(df, view_type)
    st.plotly_chart(fig, use_container_width=True, config={
        'displayModeBar': True,
        'displaylogo': False,
        # Settings used by the mode bar's "download as image" button.
        'toImageButtonOptions': {
            'format': 'png',
            'filename': 'navitrace_leaderboard',
            'height': 600,
            'width': 1200,
            'scale': 2
        }
    })

    # Detailed per-model table, collapsed by default.
    with st.expander("View Detailed Scores"):
        df_summary = create_summary_table(df)

        # Every column except the model name holds a numeric score.
        score_columns = [col for col in df_summary.columns if col != "model"]
        st.dataframe(
            df_summary.style.background_gradient(
                cmap="Blues_r",
                subset=score_columns
            ).format("{:.2f}", subset=score_columns),
            use_container_width=True,
            hide_index=True,
        )
|
|
|
|
|
# Step-by-step instructions for evaluating and submitting a model.
with st.expander("How to Test Your Model", expanded=True):

    # Step 1: run the evaluation notebook.
    st.markdown("""
<div class="instruction-item">
<div class="instruction-number">1</div>
<div class="instruction-content">
<div><b>Run Evaluation</b></div>
<div>
Download and run our evaluation notebook adjusted to your model. The notebook will generate a TSV file with your model's predictions on the test set.
</div>
</div>
</div>
""", unsafe_allow_html=True)

    # NOTE: the notebook URL below is still a placeholder.
    st.link_button("📓 Open Evaluation Notebook", "https://colab.research.google.com/your-notebook-link", width="stretch")

    # Step 2: upload the generated predictions.
    st.markdown("""
<div class="instruction-item">
<div class="instruction-number">2</div>
<div class="instruction-content">
<div><b>Upload Results</b></div>
<div>
Upload the TSV file generated by the evaluation notebook.
</div>
</div>
</div>
""", unsafe_allow_html=True)

    uploaded_file = st.file_uploader("Upload your TSV file with results", type=['tsv', 'txt'], label_visibility="collapsed")

    # Step 3: score the upload against the hidden ground truth.
    st.markdown("""
<div class="instruction-item">
<div class="instruction-number">3</div>
<div class="instruction-content">
<div><b>Calculate Score</b></div>
<div>
Click the button below to evaluate your predictions. Scores are calculated using hidden test set ground-truths.
</div>
</div>
</div>
""", unsafe_allow_html=True)

    if uploaded_file is not None:
        if st.button("🧮 Calculate Score", width="stretch"):
            with st.spinner("Validating and calculating scores..."):
                # On success ``result`` is the parsed data frame, otherwise
                # it is the error message.
                is_valid, result = validate_tsv_format(uploaded_file)
                if is_valid:
                    scores = calculate_score(result)
                    if scores is not None:
                        # Keep the scores for this session so the leaderboard
                        # above can include them, then rerun so it is redrawn.
                        st.session_state.user_results = {
                            "model": "Your Model",
                            **scores
                        }
                        st.rerun()
                else:
                    st.error(f"❌ Invalid file format: {result}")
        elif 'user_results' in st.session_state:
            # Shown on the run that follows st.rerun(): messages emitted
            # right before st.rerun() are discarded when the script restarts,
            # so the confirmation is rendered from the stored result instead.
            st.success(f"✅ Score calculated successfully: **{st.session_state.user_results['Total Score']:.1f}**")
            st.info("👆 Scroll up to see your model on the leaderboard!")
    else:
        st.info("👆 Upload a TSV file to calculate your score")

    # Step 4: submit to the official leaderboard.
    st.markdown("""
<div class="instruction-item">
<div class="instruction-number">4</div>
<div class="instruction-content">
<div><b>Submit to Official Leaderboard</b></div>
<div>
Happy with your score? Submit your model to appear on the official leaderboard.
Fill out the form below with your model details and results.
</div>
</div>
</div>
""", unsafe_allow_html=True)

    st.link_button("🗳️ Submit Model", "https://docs.google.com/forms/d/e/1FAIpQLSfcAQ6JW7eey-8OFSAz2ea_StCezxJK1dt6mjW_wR-9jCHnXg/viewform?usp=dialog", width="stretch")
|
|
|