# navitrace_leaderboard/src/streamlit_app.py
# NaviTrace Leaderboard — Streamlit app.
from src.score_calculation.score import score_predictions
import ast
from datasets import load_dataset
from huggingface_hub import login
import multiprocessing
import numpy as np
import streamlit as st
from streamlit_chunk_file_uploader import uploader
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from io import StringIO
import json
import os
# Directory holding one pre-scored TSV per model (filename stem = model name).
RESULTS_DIR = "results/"
# Page config
st.set_page_config(
    page_title="NaviTrace Leaderboard",
    layout="centered",
    initial_sidebar_state="collapsed"
)
# Custom CSS for Nerfies-style design: centered header, horizontal link row,
# numbered instruction steps, and a mobile media query for the title size.
st.markdown("""
<style>
/* Import Font Awesome */
@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');
.header-container {
display: flex;
flex-direction: column;
align-items: center;
}
/* Headings */
h1 {
text-align: center;
font-size: 4.5rem !important;
font-weight: 500;
margin-top: 1rem;
margin-bottom: 1rem;
}
/* Links container */
.links-container {
display: flex;
flex-wrap: wrap;
row-gap: 1rem;
justify-content: center;
text-align: center;
margin-bottom: 3rem;
font-size: 1.1rem;
}
.links-container a {
white-space: nowrap;
margin: 0 1rem;
text-decoration: none;
color: #3b82f6;
font-weight: 600;
transition: color 0.3s;
}
.links-container a:hover {
color: #1e3a8a;
}
/* Instructions styling */
.instruction-item {
display: flex;
gap: 1.5rem;
margin: 2rem 0;
align-items: flex-start;
}
.instruction-number {
flex-shrink: 0;
width: 40px;
height: 40px;
border-radius: 50%;
background: linear-gradient(135deg, #3b82f6 0%, #1e3a8a 100%);
color: white;
display: flex;
align-items: center;
justify-content: center;
font-weight: 700;
font-size: 1.2rem;
}
.instruction-content {
flex-grow: 1;
padding-top: 0.3rem;
}
/* Media Query for mobile devices */
@media (max-width: 600px) {
h1 {
font-size: 3.5rem !important; /* Adjust font size for small screens */
}
}
</style>
""", unsafe_allow_html=True)
def load_data():
    """Load all per-model result files as one data frame.

    Reads every ``*.tsv`` under ``RESULTS_DIR``; the file stem (with
    underscores replaced by spaces) becomes the "model" column value.

    Returns:
        A concatenated ``pd.DataFrame`` of all results, or ``None`` when
        no result files exist or loading fails.
    """
    try:
        # Load all results files
        all_dfs = []
        for file_path in Path(RESULTS_DIR).glob('*.tsv'):
            df = pd.read_csv(file_path, sep='\t')
            model_name = file_path.stem.replace('_', ' ')
            df["model"] = model_name
            all_dfs.append(df)
        # Concatenate all DataFrames into one
        if all_dfs:
            final_df = pd.concat(all_dfs, ignore_index=True)
            return final_df
        # No result files found: return None explicitly instead of falling
        # off the end of the try block (implicit None).
        return None
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return None
def calculate_score(results_df):
    """Calculate score using private test split ground truth.

    Args:
        results_df: Validated predictions frame (see ``validate_tsv_format``).

    Returns:
        The frame produced by ``score_predictions``, or ``None`` on failure.
    """
    try:
        # Access to private dataset with test labels; credentials come from
        # the HF_TOKEN / HF_DATASET_ID environment variables.
        login(token=os.environ.get("HF_TOKEN"))
        dataset = load_dataset(os.environ.get("HF_DATASET_ID"), split="test")
        # Calculate score
        return score_predictions(results_df, dataset)
    except Exception as e:
        # Surface the failure reason (the original f-string had no
        # placeholder and dropped the exception), matching load_data's style.
        st.error(f"Error calculating score: {str(e)}")
        return None
def validate_tsv_format(uploaded_file):
    """Check that an uploaded file parses as TSV with the required columns.

    Args:
        uploaded_file: Any file-like object readable by ``pd.read_csv``.

    Returns:
        ``(True, df)`` when the file is valid, otherwise
        ``(False, error_message)``.
    """
    try:
        frame = pd.read_csv(uploaded_file, sep='\t')
        # Check for required columns, data types, etc.
        required_cols = ["sample_id", "embodiment", "category", "prediction"]
        has_all = all(col in frame.columns for col in required_cols)
        if not has_all:
            return False, f"Missing required columns. Expected: {required_cols}"
        return True, frame
    except Exception as e:
        return False, f"Error reading file: {str(e)}"
@st.cache_data
def convert_df_to_tsv(df):
    """Serialize *df* as UTF-8 TSV bytes (cached across reruns by Streamlit)."""
    tsv_text = df.to_csv(sep='\t', index=False)
    return tsv_text.encode('utf-8')
def create_bar_chart(df, view_type):
    """Create an interactive Plotly bar chart for the selected view.

    Args:
        df: Results frame with "model", "score", "embodiment", and
            "category" columns; "category" cells hold stringified Python
            lists (parsed with ``ast.literal_eval``).
        view_type: "Total Score", "Per Embodiment", or anything else
            (treated as "Per Category").

    Returns:
        A configured ``plotly`` figure.
    """
    # Copy df and drop infinite scores so they don't distort the means.
    df_fig = df.copy()
    df_fig = df_fig[df_fig["score"] != np.inf]
    # Split too long names across two lines for readable x-axis labels.
    model_renaming_map = {
        "Qwen 3 VL 235b Thinking": "Qwen 3 VL 235b<br>Thinking",
    }
    df_fig["model"] = df_fig["model"].map(model_renaming_map).fillna(df_fig["model"])
    if view_type == "Total Score":
        # Calculate mean score per model
        df_fig = df_fig.groupby("model")[["score"]].mean().reset_index()
        # Sort the results from best to worst
        df_fig = df_fig.sort_values(by="score", ascending=False)
        # Create the Plotly figure: one bar per model, colored by score.
        fig = px.bar(
            df_fig,
            x="model",
            y="score",
            color="score",
            color_continuous_scale=px.colors.diverging.RdYlBu,
            orientation="v",
        )
        max_score = df_fig["score"].max()
        min_score = df_fig["score"].min()
        fig.update_layout(
            xaxis=dict(
                title=dict(
                    text="Model",
                    standoff=25,
                ),
                tickangle=-45,
            ),
            yaxis=dict(
                title_text="Score",
                # NOTE(review): *1.25 pads both ends only if min_score is
                # negative; for all-positive scores the axis would start
                # above the smallest bar — confirm scores can be negative.
                range=[min_score * 1.25, max_score * 1.25]
            ),
            title_text="",
            font=dict(size=15),
            bargap=0.2,
            height=600,
            showlegend=False,
            margin=dict(
                l=60,  # Left
                r=0,   # Right
                b=95,  # Bottom
                t=80,  # Top
                pad=0  # Padding
            ),
        )
        # Remove the color legend from the chart.
        fig.update_coloraxes(showscale=False)
        # Add annotations to show the exact score on each bar.
        fig.update_traces(
            texttemplate="%{y:.0f}",
            textposition="outside"
        )
    elif view_type == "Per Embodiment":
        # Calculate the model order (by overall mean, ascending) before
        # grouping, so the grouped chart keeps a stable model ranking.
        df_model_order = df_fig.groupby("model")[["score"]].mean().reset_index()
        model_order = df_model_order.sort_values(by="score", ascending=True)["model"].tolist()
        # Calculate mean score per model and embodiment
        df_fig = df_fig.groupby(["model", "embodiment"])[["score"]].mean().reset_index()
        # Convert the "model" column to a categorical type with the sorted order
        df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True)
        # Sort the DataFrame based on the new categorical order
        df_fig = df_fig.sort_values(by=["model", "score"], ascending=[False, False])
        # Create the Plotly figure: grouped bars, one color per embodiment.
        fig = px.bar(
            df_fig,
            x="model",
            y="score",
            color="embodiment",
            color_discrete_sequence=px.colors.qualitative.Plotly,
            orientation="v",
        )
        max_score = df_fig["score"].max()
        min_score = df_fig["score"].min()
        fig.update_layout(
            xaxis=dict(
                title=dict(
                    text="Model",
                    standoff=25,
                ),
                tickangle=-45,
            ),
            yaxis=dict(
                title_text="Score",
                range=[min_score * 1.25, max_score * 1.25]
            ),
            title_text="",
            font=dict(size=15),
            bargap=0.1,
            barmode="group",
            height=600,
            margin=dict(
                l=60,  # Left
                r=0,   # Right
                b=95,  # Bottom
                t=80,  # Top
                pad=0  # Padding
            ),
            showlegend=True,
            # Horizontal legend centered above the plot; single/double click
            # toggles one trace / isolates one trace.
            legend=dict(
                orientation="h",
                x=0.5,
                y=1.1,
                xanchor="center",
                yanchor="top",
                borderwidth=0,
                itemclick="toggle",
                itemdoubleclick="toggleothers",
                title=dict(
                    text="<b>Embodiments</b>",
                    side="top center"
                )
            ),
            uniformtext_minsize=10,
            uniformtext_mode="show",
        )
        # Remove the color legend from the chart.
        fig.update_coloraxes(showscale=False)
    else:  # Per Category
        # Calculate the model order (by overall mean, ascending).
        df_model_order = df_fig.groupby("model")[["score"]].mean().reset_index()
        model_order = df_model_order.sort_values(by="score", ascending=True)["model"].tolist()
        # Calculate mean score per model and category: parse the stringified
        # category lists, then explode so each row carries one category.
        df_fig["category"] = df_fig["category"].apply(ast.literal_eval)
        df_fig = df_fig.explode("category")
        df_fig = df_fig.groupby(["model", "category"])[["score"]].mean().reset_index()
        # Convert the "model" column to a categorical type with the sorted order
        df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True)
        # Sort the DataFrame based on the new categorical order
        df_fig = df_fig.sort_values(by=["model", "score"], ascending=[False, False])
        # Create the Plotly figure: grouped bars, one color per category
        # (reversed palette to differ visually from the embodiment view).
        fig = px.bar(
            df_fig,
            x="model",
            y="score",
            color="category",
            color_discrete_sequence=px.colors.qualitative.Plotly[::-1],
            orientation="v",
        )
        max_score = df_fig["score"].max()
        min_score = df_fig["score"].min()
        fig.update_layout(
            xaxis=dict(
                title=dict(
                    text="Model",
                    standoff=25,
                ),
                tickangle=-45,
            ),
            yaxis=dict(
                title_text="Score",
                range=[min_score * 1.25, max_score * 1.25]
            ),
            title_text="",
            font=dict(size=15),
            bargap=0.1,
            barmode="group",
            height=600,
            margin=dict(
                l=60,  # Left
                r=0,   # Right
                b=95,  # Bottom
                t=80,  # Top
                pad=0  # Padding
            ),
            showlegend=True,
            legend=dict(
                orientation="h",
                x=0.5,
                y=1.1,
                xanchor="center",
                yanchor="top",
                borderwidth=0,
                itemclick="toggle",
                itemdoubleclick="toggleothers",
                title=dict(
                    text="<b>Categories</b>",
                    side="top center"
                )
            ),
            uniformtext_minsize=10,
            uniformtext_mode="show",
        )
        # Remove the color legend from the chart.
        fig.update_coloraxes(showscale=False)
    return fig
def create_summary_table(df):
    """Build one summary table per model: total, per-embodiment, and
    per-category mean scores, sorted best-first by total score.

    Args:
        df: Results frame with "model", "score", "embodiment", and
            "category" columns ("category" holds stringified lists).

    Returns:
        A ``pd.DataFrame`` with a "model" column, a "Total Score" column,
        and one column per embodiment and per category.
    """
    # Drop sentinel infinite scores before averaging.
    scores = df[df["score"] != np.inf].copy()

    # Overall mean score per model.
    totals = scores.groupby("model")[["score"]].mean().reset_index()
    totals.columns = ["model", "Total Score"]

    # Mean score per (model, embodiment), pivoted to one column per embodiment.
    by_embodiment = (
        scores.groupby(["model", "embodiment"])[["score"]]
        .mean()
        .reset_index()
        .pivot(index="model", columns="embodiment", values="score")
    )
    by_embodiment.columns = [f"{name}" for name in by_embodiment.columns]

    # Mean score per (model, category): parse the stringified lists and
    # explode so each row carries a single category, then pivot.
    exploded = scores.copy()
    exploded["category"] = exploded["category"].apply(ast.literal_eval)
    exploded = exploded.explode("category")
    by_category = (
        exploded.groupby(["model", "category"])[["score"]]
        .mean()
        .reset_index()
        .pivot(index="model", columns="category", values="score")
    )
    by_category.columns = [f"{name}" for name in by_category.columns]

    # Combine the three tables on the model index and rank best-first.
    summary = totals.set_index("model").join(by_embodiment).join(by_category)
    summary = summary.sort_values(by="Total Score", ascending=False)
    # Reset index to make model a column again.
    return summary.reset_index()
def main():
    """Render the leaderboard page: header, chart, score table, and the
    four-step self-evaluation / submission flow."""
    # Header
    st.markdown("""
    <div class="header-container">
    <h1>NaviTrace Leaderboard</h1>
    <div class="links-container">
    <a href="https://leggedrobotics.github.io/navitrace_webpage/">
    🏠 Project
    </a>
    <a href="https://arxiv.org/abs/2510.26909">
    📄 Paper
    </a>
    <a href="https://github.com/leggedrobotics/navitrace_evaluation">
    💻 Code
    </a>
    <a href="https://huggingface.co/datasets/leggedrobotics/navitrace">
    💾 Dataset
    </a>
    </div>
    </div>
    """, unsafe_allow_html=True)
    # Load data
    # NOTE(review): load_data may return None when results/ is empty or
    # unreadable; downstream chart/table calls assume a DataFrame — confirm.
    df = load_data()
    # Add user's model if it exists in session state (so "Your Model"
    # appears alongside the official entries after a score calculation).
    if 'user_results' in st.session_state:
        user_results = pd.DataFrame(st.session_state.user_results)
        df = pd.concat([user_results, df], ignore_index=True)
    # View selector
    view_type = st.selectbox(
        "Select View",
        ["Total Score", "Per Embodiment", "Per Category"],
    )
    # Display chart
    fig = create_bar_chart(df, view_type)
    st.plotly_chart(fig, use_container_width=True, config={
        'displayModeBar': True,
        'displaylogo': False,
        'toImageButtonOptions': {
            'format': 'png',
            'filename': 'navitrace_leaderboard',
            'height': 600,
            'width': 1200,
            'scale': 2
        }
    })
    # Detailed table
    with st.expander("View Detailed Scores"):
        # Create the summary table
        df_summary = create_summary_table(df)
        # Display table with a blue gradient over every numeric column.
        st.dataframe(
            df_summary.style.background_gradient(
                cmap="Blues",
                subset=[col for col in df_summary.columns if col != "model"]
            ).format("{:.2f}", subset=[col for col in df_summary.columns if col != "model"]),
            width="stretch",
            hide_index=True,
        )
    with st.expander("How to Test Your Model", expanded=True):
        # Step 1: point the user at the evaluation notebook.
        st.markdown("""
        <div class="instruction-item">
        <div class="instruction-number">1</div>
        <div class="instruction-content">
        <div><b>Run Evaluation</b></div>
        <div>
        Download and run our evaluation notebook adjusted to your model. The notebook will generate a TSV file with your model's predictions on the test set.
        </div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        st.link_button("📓 Open Evaluation Notebook", "https://github.com/leggedrobotics/navitrace_evaluation", width="stretch")
        # Step 2: collect the predictions file.
        st.markdown("""
        <div class="instruction-item">
        <div class="instruction-number">2</div>
        <div class="instruction-content">
        <div><b>Upload Results</b></div>
        <div>
        Upload the TSV file generated by the evaluation notebook.
        </div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        # Chunk uploaded file to circumvent HF limit
        #uploaded_file = st.file_uploader("Upload your TSV file with results", type=['tsv', 'txt'], label_visibility="collapsed")
        uploaded_file = uploader("", key="chunk_uploader", chunk_size=0.5)
        # Step 3: validate and score against the hidden ground truth.
        st.markdown("""
        <div class="instruction-item">
        <div class="instruction-number">3</div>
        <div class="instruction-content">
        <div><b>Calculate Score</b></div>
        <div>
        Click the button below to evaluate your predictions. Scores are calculated using hidden test set ground-truths.
        </div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        if uploaded_file is not None:
            if st.button("🧮 Calculate Score", width="stretch"):
                # Validate format
                with st.spinner("Validating format and calculating score..."):
                    is_valid, result = validate_tsv_format(uploaded_file)
                    if is_valid:
                        # Calculate score using hidden ground-truth
                        scores = calculate_score(result)
                        if scores is not None:
                            # Store in session state and rerun so the chart
                            # at the top of the page includes "Your Model".
                            scores["model"] = "Your Model"
                            st.session_state.user_results = scores.to_dict(orient='list')
                            st.rerun()
                    else:
                        st.error(f"❌ Invalid file format: {result}")
        else:
            st.info("👆 Upload a TSV file to calculate your score")
        # Allow download of results (shown after a successful calculation).
        if 'user_results' in st.session_state:
            user_results = pd.DataFrame(st.session_state.user_results)
            st.success(f"✅ Score calculated successfully: **{user_results['score'].mean():.1f}**")
            st.info("👆 Scroll up to see your model on the leaderboard!")
            tsv_data = convert_df_to_tsv(user_results)
            st.download_button(
                label="🏅 Download Score",
                data=tsv_data,
                file_name='scores.tsv',
                mime='text/tab-separated-values',
                width="stretch",
            )
        # Step 4: official submission form.
        st.markdown("""
        <div class="instruction-item">
        <div class="instruction-number">4</div>
        <div class="instruction-content">
        <div><b>Submit to Official Leaderboard</b></div>
        <div>
        Happy with your score? Submit your model to appear on the official leaderboard.
        Fill out the form below with your model details and results.
        </div>
        </div>
        </div>
        """, unsafe_allow_html=True)
        st.link_button("🗳️ Submit Model", "https://docs.google.com/forms/d/e/1FAIpQLSfcAQ6JW7eey-8OFSAz2ea_StCezxJK1dt6mjW_wR-9jCHnXg/viewform?usp=dialog", width="stretch")
# Script entry point (Streamlit executes this module top-to-bottom).
if __name__ == "__main__":
    main()