from src.score_calculation.score import score_predictions import ast from datasets import load_dataset from huggingface_hub import login import multiprocessing import numpy as np import streamlit as st from streamlit_chunk_file_uploader import uploader import pandas as pd from pathlib import Path import plotly.graph_objects as go import plotly.express as px from io import StringIO import json import os RESULTS_DIR = "results/" # Page config st.set_page_config( page_title="NaviTrace Leaderboard", layout="centered", initial_sidebar_state="collapsed" ) # Custom CSS for Nerfies-style design st.markdown(""" """, unsafe_allow_html=True) def load_data(): """Load all result files as one data frame""" try: # Load all results files all_dfs = [] for file_path in Path(RESULTS_DIR).glob('*.tsv'): df = pd.read_csv(file_path, sep='\t') model_name = file_path.stem.replace('_', ' ') df["model"] = model_name all_dfs.append(df) # Concatenate all DataFrames into one if all_dfs: final_df = pd.concat(all_dfs, ignore_index=True) return final_df except Exception as e: st.error(f"Error loading data: {str(e)}") return None def calculate_score(results_df): """Calculate score using private test split ground truth.""" try: # Access to private dataset with test labels login(token=os.environ.get("HF_TOKEN")) dataset = load_dataset(os.environ.get("HF_DATASET_ID"), split="test") # Calculate score return score_predictions(results_df, dataset) except Exception as e: st.error(f"Error calculating score") return None def validate_tsv_format(uploaded_file): """Validate that the uploaded TSV has the correct format""" try: df = pd.read_csv(uploaded_file, sep='\t') # Check for required columns, data types, etc. required_cols = ["sample_id", "embodiment", "category", "prediction"] if not all(col in df.columns for col in required_cols): return False, f"Missing required columns. Expected: {required_cols}" return True, df except Exception as e: return False, f"Error reading file: {str(e)}" @st.cache_data def convert_df_to_tsv(df): return df.to_csv(sep='\t', index=False).encode('utf-8') def create_bar_chart(df, view_type): """Create interactive bar chart based on view type""" # Copy df df_fig = df.copy() df_fig = df_fig[df_fig["score"] != np.inf] # Split too long names model_renaming_map = { "Qwen 3 VL 235b Thinking": "Qwen 3 VL 235b
Thinking", } df_fig["model"] = df_fig["model"].map(model_renaming_map).fillna(df_fig["model"]) if view_type == "Total Score": # Calculate mean score per model df_fig = df_fig.groupby("model")[["score"]].mean().reset_index() # Sort the results from best to worst df_fig = df_fig.sort_values(by="score", ascending=False) # Create the Plotly figure fig = px.bar( df_fig, x="model", y="score", color="score", color_continuous_scale=px.colors.diverging.RdYlBu, orientation="v", ) max_score = df_fig["score"].max() min_score = df_fig["score"].min() fig.update_layout( xaxis=dict( title=dict( text="Model", standoff=25, ), tickangle=-45, ), yaxis=dict( title_text="Score", range=[min_score * 1.25, max_score * 1.25] ), title_text="", font=dict(size=15), bargap=0.2, height=600, showlegend=False, margin=dict( l=60, # Left r=0, # Right b=95, # Bottom t=80, # Top pad=0 # Padding ), ) # Remove the color legend from the chart. fig.update_coloraxes(showscale=False) # Add annotations to show the exact score on each bar. fig.update_traces( texttemplate="%{y:.0f}", textposition="outside" ) elif view_type == "Per Embodiment": # Calculate the model order df_model_order = df_fig.groupby("model")[["score"]].mean().reset_index() model_order = df_model_order.sort_values(by="score", ascending=True)["model"].tolist() # Calculate mean score per model and embodiment df_fig = df_fig.groupby(["model", "embodiment"])[["score"]].mean().reset_index() # Convert the "model" column to a categorical type with the sorted order df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True) # Sort the DataFrame based on the new categorical order df_fig = df_fig.sort_values(by=["model", "score"], ascending=[False, False]) # Create the Plotly figure fig = px.bar( df_fig, x="model", y="score", color="embodiment", color_discrete_sequence=px.colors.qualitative.Plotly, orientation="v", ) max_score = df_fig["score"].max() min_score = df_fig["score"].min() fig.update_layout( xaxis=dict( title=dict( text="Model", standoff=25, ), tickangle=-45, ), yaxis=dict( title_text="Score", range=[min_score * 1.25, max_score * 1.25] ), title_text="", font=dict(size=15), bargap=0.1, barmode="group", height=600, margin=dict( l=60, # Left r=0, # Right b=95, # Bottom t=80, # Top pad=0 # Padding ), showlegend=True, legend=dict( orientation="h", x=0.5, y=1.1, xanchor="center", yanchor="top", borderwidth=0, itemclick="toggle", itemdoubleclick="toggleothers", title=dict( text="Embodiments", side="top center" ) ), uniformtext_minsize=10, uniformtext_mode="show", ) # Remove the color legend from the chart. fig.update_coloraxes(showscale=False) else: # Per Category # Calculate the model order df_model_order = df_fig.groupby("model")[["score"]].mean().reset_index() model_order = df_model_order.sort_values(by="score", ascending=True)["model"].tolist() # Calculate mean score per model and embodiment df_fig["category"] = df_fig["category"].apply(ast.literal_eval) df_fig = df_fig.explode("category") df_fig = df_fig.groupby(["model", "category"])[["score"]].mean().reset_index() # Convert the "model" column to a categorical type with the sorted order df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True) # Sort the DataFrame based on the new categorical order df_fig = df_fig.sort_values(by=["model", "score"], ascending=[False, False]) # Create the Plotly figure fig = px.bar( df_fig, x="model", y="score", color="category", color_discrete_sequence=px.colors.qualitative.Plotly[::-1], orientation="v", ) max_score = df_fig["score"].max() min_score = df_fig["score"].min() fig.update_layout( xaxis=dict( title=dict( text="Model", standoff=25, ), tickangle=-45, ), yaxis=dict( title_text="Score", range=[min_score * 1.25, max_score * 1.25] ), title_text="", font=dict(size=15), bargap=0.1, barmode="group", height=600, margin=dict( l=60, # Left r=0, # Right b=95, # Bottom t=80, # Top pad=0 # Padding ), showlegend=True, legend=dict( orientation="h", x=0.5, y=1.1, xanchor="center", yanchor="top", borderwidth=0, itemclick="toggle", itemdoubleclick="toggleothers", title=dict( text="Categories", side="top center" ) ), uniformtext_minsize=10, uniformtext_mode="show", ) # Remove the color legend from the chart. fig.update_coloraxes(showscale=False) return fig def create_summary_table(df): # Copy df df_table = df.copy() df_table = df_table[df_table["score"] != np.inf] # Calculate total score per model df_total = df_table.groupby("model")[["score"]].mean().reset_index() df_total.columns = ["model", "Total Score"] # Calculate scores per embodiment df_embodiment = df_table.groupby(["model", "embodiment"])[["score"]].mean().reset_index() df_embodiment_pivot = df_embodiment.pivot(index="model", columns="embodiment", values="score") df_embodiment_pivot.columns = [f"{col}" for col in df_embodiment_pivot.columns] # Calculate scores per category df_category = df_table.copy() df_category["category"] = df_category["category"].apply(ast.literal_eval) df_category = df_category.explode("category") df_category = df_category.groupby(["model", "category"])[["score"]].mean().reset_index() df_category_pivot = df_category.pivot(index="model", columns="category", values="score") df_category_pivot.columns = [f"{col}" for col in df_category_pivot.columns] # Combine all tables df_summary = df_total.set_index("model") df_summary = df_summary.join(df_embodiment_pivot) df_summary = df_summary.join(df_category_pivot) # Sort by total score df_summary = df_summary.sort_values(by="Total Score", ascending=False) # Reset index to make model a column again df_summary = df_summary.reset_index() return df_summary def main(): # Header st.markdown(""" """, unsafe_allow_html=True) # Load data df = load_data() # Add user's model if it exists in session state if 'user_results' in st.session_state: user_results = pd.DataFrame(st.session_state.user_results) df = pd.concat([user_results, df], ignore_index=True) # View selector view_type = st.selectbox( "Select View", ["Total Score", "Per Embodiment", "Per Category"], ) # Display chart fig = create_bar_chart(df, view_type) st.plotly_chart(fig, use_container_width=True, config={ 'displayModeBar': True, 'displaylogo': False, 'toImageButtonOptions': { 'format': 'png', 'filename': 'navitrace_leaderboard', 'height': 600, 'width': 1200, 'scale': 2 } }) # Detailed table with st.expander("View Detailed Scores"): # Create the summary table df_summary = create_summary_table(df) # Display table st.dataframe( df_summary.style.background_gradient( cmap="Blues", subset=[col for col in df_summary.columns if col != "model"] ).format("{:.2f}", subset=[col for col in df_summary.columns if col != "model"]), width="stretch", hide_index=True, ) with st.expander("How to Test Your Model", expanded=True): # Step 1 st.markdown("""

Run Evaluation

Download and run our evaluation notebook adjusted to your model. The notebook will generate a TSV file with your model's predictions on the test set.

""", unsafe_allow_html=True) st.link_button("📓 Open Evaluation Notebook", "https://github.com/leggedrobotics/navitrace_evaluation", width="stretch") # Step 2 st.markdown("""

Upload Results

Upload the TSV file generated by the evaluation notebook.

""", unsafe_allow_html=True) # Chunk uploaded file to circumvent HF limit #uploaded_file = st.file_uploader("Upload your TSV file with results", type=['tsv', 'txt'], label_visibility="collapsed") uploaded_file = uploader("", key="chunk_uploader", chunk_size=0.5) # Step 3 st.markdown("""

Calculate Score

Click the button below to evaluate your predictions. Scores are calculated using hidden test set ground-truths.

""", unsafe_allow_html=True) if uploaded_file is not None: if st.button("🧮 Calculate Score", width="stretch"): # Validate format with st.spinner("Validating format and calculating score..."): is_valid, result = validate_tsv_format(uploaded_file) if is_valid: # Calculate score using hidden ground-truth scores = calculate_score(result) if scores is not None: # Store in session state scores["model"] = "Your Model" st.session_state.user_results = scores.to_dict(orient='list') st.rerun() else: st.error(f"❌ Invalid file format: {result}") else: st.info("👆 Upload a TSV file to calculate your score") # Allow download of results if 'user_results' in st.session_state: user_results = pd.DataFrame(st.session_state.user_results) st.success(f"✅ Score calculated successfully: **{user_results['score'].mean():.1f}**") st.info("👆 Scroll up to see your model on the leaderboard!") tsv_data = convert_df_to_tsv(user_results) st.download_button( label="🏅 Download Score", data=tsv_data, file_name='scores.tsv', mime='text/tab-separated-values', width="stretch", ) # Step 4 st.markdown("""

Submit to Official Leaderboard

Happy with your score? Submit your model to appear on the official leaderboard. Fill out the form below with your model details and results.

""", unsafe_allow_html=True) st.link_button("🗳️ Submit Model", "https://docs.google.com/forms/d/e/1FAIpQLSfcAQ6JW7eey-8OFSAz2ea_StCezxJK1dt6mjW_wR-9jCHnXg/viewform?usp=dialog", width="stretch") if __name__ == "__main__": main()