# navitrace_leaderboard/src/streamlit_app.py
# NaviTrace Leaderboard — Streamlit app.
from src.score_calculation.score import score_predictions
import ast
from datasets import load_dataset
from huggingface_hub import login
import multiprocessing
import numpy as np
import streamlit as st
from streamlit_chunk_file_uploader import uploader
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from io import StringIO
import json
import os
# Directory holding one pre-scored TSV per model (filename stem = model name).
RESULTS_DIR = "results/"
# Page config
st.set_page_config(
    page_title="NaviTrace Leaderboard",
    layout="centered",
    initial_sidebar_state="collapsed"
)
# Custom CSS for Nerfies-style design: centered header, horizontal link row,
# numbered instruction steps, and a mobile media query for the title size.
st.markdown("""
<style>
/* Import Font Awesome */
@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');
.header-container {
display: flex;
flex-direction: column;
align-items: center;
}
/* Headings */
h1 {
text-align: center;
font-size: 4.5rem !important;
font-weight: 500;
margin-top: 1rem;
margin-bottom: 1rem;
}
/* Links container */
.links-container {
display: flex;
flex-wrap: wrap;
row-gap: 1rem;
justify-content: center;
text-align: center;
margin-bottom: 3rem;
font-size: 1.1rem;
}
.links-container a {
white-space: nowrap;
margin: 0 1rem;
text-decoration: none;
color: #3b82f6;
font-weight: 600;
transition: color 0.3s;
}
.links-container a:hover {
color: #1e3a8a;
}
/* Instructions styling */
.instruction-item {
display: flex;
gap: 1.5rem;
margin: 2rem 0;
align-items: flex-start;
}
.instruction-number {
flex-shrink: 0;
width: 40px;
height: 40px;
border-radius: 50%;
background: linear-gradient(135deg, #3b82f6 0%, #1e3a8a 100%);
color: white;
display: flex;
align-items: center;
justify-content: center;
font-weight: 700;
font-size: 1.2rem;
}
.instruction-content {
flex-grow: 1;
padding-top: 0.3rem;
}
/* Media Query for mobile devices */
@media (max-width: 600px) {
h1 {
font-size: 3.5rem !important; /* Adjust font size for small screens */
}
}
</style>
""", unsafe_allow_html=True)
def load_data():
    """Load all per-model result files as one data frame.

    Reads every ``*.tsv`` under ``RESULTS_DIR``; the file stem (with
    underscores replaced by spaces) becomes the "model" column value.

    Returns:
        A concatenated ``pd.DataFrame`` of all results, or ``None`` when
        no result files exist or loading fails.
    """
    try:
        # Load all results files
        all_dfs = []
        for file_path in Path(RESULTS_DIR).glob('*.tsv'):
            df = pd.read_csv(file_path, sep='\t')
            model_name = file_path.stem.replace('_', ' ')
            df["model"] = model_name
            all_dfs.append(df)
        # Concatenate all DataFrames into one
        if all_dfs:
            final_df = pd.concat(all_dfs, ignore_index=True)
            return final_df
        # No result files found: return None explicitly instead of falling
        # off the end of the try block (implicit None).
        return None
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return None
def calculate_score(results_df):
    """Calculate score using private test split ground truth.

    Args:
        results_df: Validated predictions frame (see ``validate_tsv_format``).

    Returns:
        The frame produced by ``score_predictions``, or ``None`` on failure.
    """
    try:
        # Access to private dataset with test labels; credentials come from
        # the HF_TOKEN / HF_DATASET_ID environment variables.
        login(token=os.environ.get("HF_TOKEN"))
        dataset = load_dataset(os.environ.get("HF_DATASET_ID"), split="test")
        # Calculate score
        return score_predictions(results_df, dataset)
    except Exception as e:
        # Surface the failure reason (the original f-string had no
        # placeholder and dropped the exception), matching load_data's style.
        st.error(f"Error calculating score: {str(e)}")
        return None
def validate_tsv_format(uploaded_file):
    """Check that an uploaded file parses as TSV with the required columns.

    Args:
        uploaded_file: Any file-like object readable by ``pd.read_csv``.

    Returns:
        ``(True, df)`` when the file is valid, otherwise
        ``(False, error_message)``.
    """
    try:
        frame = pd.read_csv(uploaded_file, sep='\t')
        # Check for required columns, data types, etc.
        required_cols = ["sample_id", "embodiment", "category", "prediction"]
        has_all = all(col in frame.columns for col in required_cols)
        if not has_all:
            return False, f"Missing required columns. Expected: {required_cols}"
        return True, frame
    except Exception as e:
        return False, f"Error reading file: {str(e)}"
@st.cache_data
def convert_df_to_tsv(df):
    """Serialize *df* as UTF-8 TSV bytes (cached across reruns by Streamlit)."""
    tsv_text = df.to_csv(sep='\t', index=False)
    return tsv_text.encode('utf-8')
def create_bar_chart(df, view_type):
    """Create an interactive Plotly bar chart for the selected view.

    Args:
        df: Results frame with "model", "score", "embodiment", and
            "category" columns; "category" cells hold stringified Python
            lists (parsed with ``ast.literal_eval``).
        view_type: "Total Score", "Per Embodiment", or anything else
            (treated as "Per Category").

    Returns:
        A configured ``plotly`` figure.
    """
    # Copy df and drop infinite scores so they don't distort the means.
    df_fig = df.copy()
    df_fig = df_fig[df_fig["score"] != np.inf]
    # Split too long names across two lines for readable x-axis labels.
    model_renaming_map = {
        "Qwen 3 VL 235b Thinking": "Qwen 3 VL 235b<br>Thinking",
    }
    df_fig["model"] = df_fig["model"].map(model_renaming_map).fillna(df_fig["model"])
    if view_type == "Total Score":
        # Calculate mean score per model
        df_fig = df_fig.groupby("model")[["score"]].mean().reset_index()
        # Sort the results from best to worst
        df_fig = df_fig.sort_values(by="score", ascending=False)
        # Create the Plotly figure: one bar per model, colored by score.
        fig = px.bar(
            df_fig,
            x="model",
            y="score",
            color="score",
            color_continuous_scale=px.colors.diverging.RdYlBu,
            orientation="v",
        )
        max_score = df_fig["score"].max()
        min_score = df_fig["score"].min()
        fig.update_layout(
            xaxis=dict(
                title=dict(
                    text="Model",
                    standoff=25,
                ),
                tickangle=-45,
            ),
            yaxis=dict(
                title_text="Score",
                # NOTE(review): *1.25 pads both ends only if min_score is
                # negative; for all-positive scores the axis would start
                # above the smallest bar — confirm scores can be negative.
                range=[min_score * 1.25, max_score * 1.25]
            ),
            title_text="",
            font=dict(size=15),
            bargap=0.2,
            height=600,
            showlegend=False,
            margin=dict(
                l=60,  # Left
                r=0,   # Right
                b=95,  # Bottom
                t=80,  # Top
                pad=0  # Padding
            ),
        )
        # Remove the color legend from the chart.
        fig.update_coloraxes(showscale=False)
        # Add annotations to show the exact score on each bar.
        fig.update_traces(
            texttemplate="%{y:.0f}",
            textposition="outside"
        )
    elif view_type == "Per Embodiment":
        # Calculate the model order (by overall mean, ascending) before
        # grouping, so the grouped chart keeps a stable model ranking.
        df_model_order = df_fig.groupby("model")[["score"]].mean().reset_index()
        model_order = df_model_order.sort_values(by="score", ascending=True)["model"].tolist()
        # Calculate mean score per model and embodiment
        df_fig = df_fig.groupby(["model", "embodiment"])[["score"]].mean().reset_index()
        # Convert the "model" column to a categorical type with the sorted order
        df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True)
        # Sort the DataFrame based on the new categorical order
        df_fig = df_fig.sort_values(by=["model", "score"], ascending=[False, False])
        # Create the Plotly figure: grouped bars, one color per embodiment.
        fig = px.bar(
            df_fig,
            x="model",
            y="score",
            color="embodiment",
            color_discrete_sequence=px.colors.qualitative.Plotly,
            orientation="v",
        )
        max_score = df_fig["score"].max()
        min_score = df_fig["score"].min()
        fig.update_layout(
            xaxis=dict(
                title=dict(
                    text="Model",
                    standoff=25,
                ),
                tickangle=-45,
            ),
            yaxis=dict(
                title_text="Score",
                range=[min_score * 1.25, max_score * 1.25]
            ),
            title_text="",
            font=dict(size=15),
            bargap=0.1,
            barmode="group",
            height=600,
            margin=dict(
                l=60,  # Left
                r=0,   # Right
                b=95,  # Bottom
                t=80,  # Top
                pad=0  # Padding
            ),
            showlegend=True,
            # Horizontal legend centered above the plot; single/double click
            # toggles one trace / isolates one trace.
            legend=dict(
                orientation="h",
                x=0.5,
                y=1.1,
                xanchor="center",
                yanchor="top",
                borderwidth=0,
                itemclick="toggle",
                itemdoubleclick="toggleothers",
                title=dict(
                    text="<b>Embodiments</b>",
                    side="top center"
                )
            ),
            uniformtext_minsize=10,
            uniformtext_mode="show",
        )
        # Remove the color legend from the chart.
        fig.update_coloraxes(showscale=False)
    else:  # Per Category
        # Calculate the model order (by overall mean, ascending).
        df_model_order = df_fig.groupby("model")[["score"]].mean().reset_index()
        model_order = df_model_order.sort_values(by="score", ascending=True)["model"].tolist()
        # Calculate mean score per model and category: parse the stringified
        # category lists, then explode so each row carries one category.
        df_fig["category"] = df_fig["category"].apply(ast.literal_eval)
        df_fig = df_fig.explode("category")
        df_fig = df_fig.groupby(["model", "category"])[["score"]].mean().reset_index()
        # Convert the "model" column to a categorical type with the sorted order
        df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True)
        # Sort the DataFrame based on the new categorical order
        df_fig = df_fig.sort_values(by=["model", "score"], ascending=[False, False])
        # Create the Plotly figure: grouped bars, one color per category
        # (reversed palette to differ visually from the embodiment view).
        fig = px.bar(
            df_fig,
            x="model",
            y="score",
            color="category",
            color_discrete_sequence=px.colors.qualitative.Plotly[::-1],
            orientation="v",
        )
        max_score = df_fig["score"].max()
        min_score = df_fig["score"].min()
        fig.update_layout(
            xaxis=dict(
                title=dict(
                    text="Model",
                    standoff=25,
                ),
                tickangle=-45,
            ),
            yaxis=dict(
                title_text="Score",
                range=[min_score * 1.25, max_score * 1.25]
            ),
            title_text="",
            font=dict(size=15),
            bargap=0.1,
            barmode="group",
            height=600,
            margin=dict(
                l=60,  # Left
                r=0,   # Right
                b=95,  # Bottom
                t=80,  # Top
                pad=0  # Padding
            ),
            showlegend=True,
            legend=dict(
                orientation="h",
                x=0.5,
                y=1.1,
                xanchor="center",
                yanchor="top",
                borderwidth=0,
                itemclick="toggle",
                itemdoubleclick="toggleothers",
                title=dict(
                    text="<b>Categories</b>",
                    side="top center"
                )
            ),
            uniformtext_minsize=10,
            uniformtext_mode="show",
        )
        # Remove the color legend from the chart.
        fig.update_coloraxes(showscale=False)
    return fig
def create_summary_table(df):
    """Build one summary table per model: total, per-embodiment, and
    per-category mean scores, sorted best-first by total score.

    Args:
        df: Results frame with "model", "score", "embodiment", and
            "category" columns ("category" holds stringified lists).

    Returns:
        A ``pd.DataFrame`` with a "model" column, a "Total Score" column,
        and one column per embodiment and per category.
    """
    # Drop sentinel infinite scores before averaging.
    scores = df[df["score"] != np.inf].copy()

    # Overall mean score per model.
    totals = scores.groupby("model")[["score"]].mean().reset_index()
    totals.columns = ["model", "Total Score"]

    # Mean score per (model, embodiment), pivoted to one column per embodiment.
    by_embodiment = (
        scores.groupby(["model", "embodiment"])[["score"]]
        .mean()
        .reset_index()
        .pivot(index="model", columns="embodiment", values="score")
    )
    by_embodiment.columns = [f"{name}" for name in by_embodiment.columns]

    # Mean score per (model, category): parse the stringified lists and
    # explode so each row carries a single category, then pivot.
    exploded = scores.copy()
    exploded["category"] = exploded["category"].apply(ast.literal_eval)
    exploded = exploded.explode("category")
    by_category = (
        exploded.groupby(["model", "category"])[["score"]]
        .mean()
        .reset_index()
        .pivot(index="model", columns="category", values="score")
    )
    by_category.columns = [f"{name}" for name in by_category.columns]

    # Combine the three tables on the model index and rank best-first.
    summary = totals.set_index("model").join(by_embodiment).join(by_category)
    summary = summary.sort_values(by="Total Score", ascending=False)
    # Reset index to make model a column again.
    return summary.reset_index()
def main():
    """Render the leaderboard page: header, chart, score table, and the
    four-step self-evaluation / submission flow."""
    # Header
    st.markdown("""
    <div class="header-container">
    <h1>NaviTrace Leaderboard</h1>
    <div class="links-container">
    <a href="https://leggedrobotics.github.io/navitrace_webpage/">
    🏠 Project
    </a>
    <a href="https://arxiv.org/abs/2510.26909">
    📄 Paper
    </a>
    <a href="https://github.com/leggedrobotics/navitrace_evaluation">
    💻 Code
    </a>
    <a href="https://huggingface.co/datasets/leggedrobotics/navitrace">
    💾 Dataset
    </a>
    </div>
    </div>
    """, unsafe_allow_html=True)
    # Load data
    # NOTE(review): load_data may return None when results/ is empty or
    # unreadable; downstream chart/table calls assume a DataFrame — confirm.
    df = load_data()
    # Add user's model if it exists in session state (so "Your Model"
    # appears alongside the official entries after a score calculation).
    if 'user_results' in st.session_state:
        user_results = pd.DataFrame(st.session_state.user_results)
        df = pd.concat([user_results, df], ignore_index=True)
    # View selector
    view_type = st.selectbox(
        "Select View",
        ["Total Score", "Per Embodiment", "Per Category"],
    )
    # Display chart
    fig = create_bar_chart(df, view_type)
    st.plotly_chart(fig, use_container_width=True, config={
        'displayModeBar': True,
        'displaylogo': False,
        'toImageButtonOptions': {
            'format': 'png',
            'filename': 'navitrace_leaderboard',
            'height': 600,
            'width': 1200,
            'scale': 2
        }
    })
    # Detailed table
    with st.expander("View Detailed Scores"):
        # Create the summary table
        df_summary = create_summary_table(df)
        # Display table with a blue gradient over every numeric column.
        st.dataframe(
            df_summary.style.background_gradient(
                cmap="Blues",
                subset=[col for col in df_summary.columns if col != "model"]
            ).format("{:.2f}", subset=[col for col in df_summary.columns if col != "model"]),
            width="stretch",
            hide_index=True,
        )
    with st.expander("How to Test Your Model", expanded=True):
        # Step 1: point the user at the evaluation notebook.
        st.markdown("""
        <div class="instruction-item">
        <div class="instruction-number">1</div>
        <div class="instruction-content">
        <div><b>Run Evaluation</b></div>
        <div>
        Download and run our evaluation notebook adjusted to your model. The notebook will generate a TSV file with your model's predictions on the test set.
        </div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        st.link_button("📓 Open Evaluation Notebook", "https://github.com/leggedrobotics/navitrace_evaluation", width="stretch")
        # Step 2: collect the predictions file.
        st.markdown("""
        <div class="instruction-item">
        <div class="instruction-number">2</div>
        <div class="instruction-content">
        <div><b>Upload Results</b></div>
        <div>
        Upload the TSV file generated by the evaluation notebook.
        </div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        # Chunk uploaded file to circumvent HF limit
        #uploaded_file = st.file_uploader("Upload your TSV file with results", type=['tsv', 'txt'], label_visibility="collapsed")
        uploaded_file = uploader("", key="chunk_uploader", chunk_size=0.5)
        # Step 3: validate and score against the hidden ground truth.
        st.markdown("""
        <div class="instruction-item">
        <div class="instruction-number">3</div>
        <div class="instruction-content">
        <div><b>Calculate Score</b></div>
        <div>
        Click the button below to evaluate your predictions. Scores are calculated using hidden test set ground-truths.
        </div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        if uploaded_file is not None:
            if st.button("🧮 Calculate Score", width="stretch"):
                # Validate format
                with st.spinner("Validating format and calculating score..."):
                    is_valid, result = validate_tsv_format(uploaded_file)
                    if is_valid:
                        # Calculate score using hidden ground-truth
                        scores = calculate_score(result)
                        if scores is not None:
                            # Store in session state and rerun so the chart
                            # at the top of the page includes "Your Model".
                            scores["model"] = "Your Model"
                            st.session_state.user_results = scores.to_dict(orient='list')
                            st.rerun()
                    else:
                        st.error(f"❌ Invalid file format: {result}")
        else:
            st.info("👆 Upload a TSV file to calculate your score")
        # Allow download of results (shown after a successful calculation).
        if 'user_results' in st.session_state:
            user_results = pd.DataFrame(st.session_state.user_results)
            st.success(f"✅ Score calculated successfully: **{user_results['score'].mean():.1f}**")
            st.info("👆 Scroll up to see your model on the leaderboard!")
            tsv_data = convert_df_to_tsv(user_results)
            st.download_button(
                label="🏅 Download Score",
                data=tsv_data,
                file_name='scores.tsv',
                mime='text/tab-separated-values',
                width="stretch",
            )
        # Step 4: official submission form.
        st.markdown("""
        <div class="instruction-item">
        <div class="instruction-number">4</div>
        <div class="instruction-content">
        <div><b>Submit to Official Leaderboard</b></div>
        <div>
        Happy with your score? Submit your model to appear on the official leaderboard.
        Fill out the form below with your model details and results.
        </div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        st.link_button("🗳️ Submit Model", "https://docs.google.com/forms/d/e/1FAIpQLSfcAQ6JW7eey-8OFSAz2ea_StCezxJK1dt6mjW_wR-9jCHnXg/viewform?usp=dialog", width="stretch")
# Script entry point (Streamlit executes this module top-to-bottom).
if __name__ == "__main__":
    main()