from src.score_calculation.score import score_predictions
import ast
from datasets import load_dataset
import multiprocessing
import numpy as np
import streamlit as st
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from io import StringIO
import json

# Directory holding one TSV of predictions per already-evaluated model.
RESULTS_DIR = "results/"

# Page config
st.set_page_config(
    page_title="NaviTrace Leaderboard",
    layout="centered",
    initial_sidebar_state="collapsed"
)

# Custom CSS for Nerfies-style design
st.markdown(""" """, unsafe_allow_html=True)


def load_data():
    """Load all result files as one data frame.

    Each ``*.tsv`` in RESULTS_DIR holds one model's per-sample scores; the
    model name is derived from the file stem (underscores become spaces).

    Returns:
        A concatenated DataFrame with an added ``model`` column, or ``None``
        when no result files exist or reading fails.
    """
    try:
        all_dfs = []
        for file_path in Path(RESULTS_DIR).glob('*.tsv'):
            df = pd.read_csv(file_path, sep='\t')
            model_name = file_path.stem.replace('_', ' ')
            df["model"] = model_name
            all_dfs.append(df)
        if all_dfs:
            return pd.concat(all_dfs, ignore_index=True)
        # No result files found: make the empty case an explicit None instead
        # of an implicit fall-through.
        return None
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return None


def calculate_score(results_df):
    """Calculate score using private test split ground truth.

    Args:
        results_df: DataFrame of user predictions (validated TSV upload).

    Returns:
        The score dict from ``score_predictions``, or ``None`` on any error
        (the error is surfaced to the UI via ``st.error``).
    """
    try:
        # BUGFIX: `os` and `login` were used here without ever being
        # imported, so every call failed with a NameError that the broad
        # `except` silently converted into an error banner. Import lazily so
        # the rest of the app works even if huggingface_hub is absent.
        import os
        from huggingface_hub import login

        # Access to private dataset with test labels
        login(token=os.environ.get("HF_TOKEN"))
        dataset = load_dataset(os.environ.get("HF_DATASET_ID"), split="test")
        # Calculate score
        return score_predictions(results_df, dataset)
    except Exception as e:
        st.error(f"Error calculating score: {str(e)}")
        return None


def validate_tsv_format(uploaded_file):
    """Validate that the uploaded TSV has the correct format.

    Returns:
        ``(True, DataFrame)`` when the file parses and has all required
        columns, otherwise ``(False, error_message)``.
    """
    required_cols = ["sample_id", "embodiment", "category", "prediction"]
    try:
        df = pd.read_csv(uploaded_file, sep='\t')
        # BUGFIX: the error message string was broken across a physical line
        # (invalid syntax); reconstructed as a single-line f-string.
        if not all(col in df.columns for col in required_cols):
            return False, f"Missing required columns. Expected: {required_cols}"
        return True, df
    except Exception as e:
        return False, f"Error reading file: {str(e)}"


def _base_layout(max_score):
    """Layout options shared by all three leaderboard chart views."""
    return dict(
        xaxis=dict(
            title=dict(
                text="Model",
                standoff=25,
            ),
            tickangle=-45,
        ),
        yaxis=dict(
            title_text="Score (Lower is better)",
            # Headroom so the outside-positioned value labels stay visible.
            range=[0, max_score * 1.25]
        ),
        title_text="",
        font=dict(size=15),
        height=600,
        margin=dict(
            l=60,   # Left
            r=0,    # Right
            b=95,   # Bottom
            t=80,   # Top
            pad=0   # Padding
        ),
    )


def _grouped_chart(df_fig, group_col, color_sequence, legend_title, model_order):
    """Build a grouped bar chart of mean score per model and *group_col*.

    Args:
        df_fig: Pre-filtered score rows (inf scores already removed; for the
            category view, the list-valued column is already exploded).
        group_col: Column to group/color by ("embodiment" or "category").
        color_sequence: Discrete color sequence for the groups.
        legend_title: Title shown above the horizontal legend.
        model_order: Models sorted best-to-worst by overall mean score. This
            is computed by the caller BEFORE any explode so the ordering
            matches the Total Score view.
    """
    # Mean score per model and group.
    df_fig = df_fig.groupby(["model", group_col])[["score"]].mean().reset_index()
    # Convert "model" to an ordered categorical so bars follow model_order.
    df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True)
    df_fig = df_fig.sort_values(by=["model", "score"], ascending=[True, True])

    fig = px.bar(
        df_fig,
        x="model",
        y="score",
        color=group_col,
        color_discrete_sequence=color_sequence,
        orientation="v",
    )
    fig.update_layout(
        **_base_layout(df_fig["score"].max()),
        bargap=0.1,
        barmode="group",
        showlegend=True,
        legend=dict(
            orientation="h",
            x=0.5,
            y=1.1,
            xanchor="center",
            yanchor="top",
            borderwidth=0,
            itemclick="toggle",
            itemdoubleclick="toggleothers",
            title=dict(
                text=legend_title,
                side="top center"
            )
        ),
        uniformtext_minsize=10,
        uniformtext_mode="show",
    )
    # Remove the color legend from the chart.
    fig.update_coloraxes(showscale=False)
    # Add annotations to show the exact score on each bar.
    fig.update_traces(
        texttemplate="%{y:.2f}",
        textposition="outside",
        textangle=-90,
    )
    return fig


def create_bar_chart(df, view_type):
    """Create interactive bar chart based on view type.

    Args:
        df: Per-sample score rows with model / embodiment / category columns.
        view_type: "Total Score", "Per Embodiment", or "Per Category".
    """
    df_fig = df.copy()
    # Drop sentinel scores (inf marks samples that could not be scored).
    df_fig = df_fig[df_fig["score"] != np.inf]

    # Model order by overall mean score, best (lowest) first. Computed before
    # any category explode so every view ranks models identically.
    model_order = (
        df_fig.groupby("model")[["score"]].mean().reset_index()
        .sort_values(by="score", ascending=True)["model"].tolist()
    )

    if view_type == "Total Score":
        df_total = df_fig.groupby("model")[["score"]].mean().reset_index()
        df_total = df_total.sort_values(by="score", ascending=True)
        fig = px.bar(
            df_total,
            x="model",
            y="score",
            color="score",
            color_continuous_scale=px.colors.diverging.Fall,
            orientation="v",
        )
        fig.update_layout(
            **_base_layout(df_total["score"].max()),
            bargap=0.2,
            showlegend=False,
        )
        # Remove the color legend from the chart.
        fig.update_coloraxes(showscale=False)
        # Add annotations to show the exact score on each bar.
        fig.update_traces(
            texttemplate="%{y:.2f}",
            textposition="outside"
        )
    elif view_type == "Per Embodiment":
        fig = _grouped_chart(
            df_fig, "embodiment",
            px.colors.qualitative.Plotly, "Embodiments", model_order,
        )
    else:  # Per Category
        # Categories are stored as stringified lists; expand to one row each.
        df_fig["category"] = df_fig["category"].apply(ast.literal_eval)
        df_fig = df_fig.explode("category")
        fig = _grouped_chart(
            df_fig, "category",
            px.colors.qualitative.Plotly[::-1], "Categories", model_order,
        )
    return fig


def create_summary_table(df):
    """Build one row per model: total score plus per-embodiment and
    per-category mean scores, sorted best (lowest total) first."""
    df_table = df.copy()
    df_table = df_table[df_table["score"] != np.inf]

    # Total score per model.
    df_total = df_table.groupby("model")[["score"]].mean().reset_index()
    df_total.columns = ["model", "Total Score"]

    # Scores per embodiment (one column per embodiment).
    df_embodiment = df_table.groupby(["model", "embodiment"])[["score"]].mean().reset_index()
    df_embodiment_pivot = df_embodiment.pivot(index="model", columns="embodiment", values="score")
    # Force plain string column labels so the joined frame is uniform.
    df_embodiment_pivot.columns = [f"{col}" for col in df_embodiment_pivot.columns]

    # Scores per category (list-valued column exploded first).
    df_category = df_table.copy()
    df_category["category"] = df_category["category"].apply(ast.literal_eval)
    df_category = df_category.explode("category")
    df_category = df_category.groupby(["model", "category"])[["score"]].mean().reset_index()
    df_category_pivot = df_category.pivot(index="model", columns="category", values="score")
    df_category_pivot.columns = [f"{col}" for col in df_category_pivot.columns]

    # Combine, sort by total score, and restore "model" as a column.
    df_summary = df_total.set_index("model").join(df_embodiment_pivot).join(df_category_pivot)
    df_summary = df_summary.sort_values(by="Total Score", ascending=True)
    return df_summary.reset_index()


# Header
st.markdown("""

NaviTrace Leaderboard

""", unsafe_allow_html=True)

# Load data
df = load_data()

# Add user's model if it exists in session state
if 'user_results' in st.session_state:
    user_results = pd.DataFrame([st.session_state.user_results])
    df = pd.concat([user_results, df], ignore_index=True)

# Guard: without any results (no files and no user upload yet) the chart and
# table builders would crash on None; stop rendering with a hint instead.
if df is None:
    st.warning("No results available yet. Upload a TSV below to get started.")
    st.stop()

# View selector
view_type = st.selectbox(
    "Select View",
    ["Total Score", "Per Embodiment", "Per Category"],
)

# Display chart
fig = create_bar_chart(df, view_type)
st.plotly_chart(fig, use_container_width=True, config={
    'displayModeBar': True,
    'displaylogo': False,
    'toImageButtonOptions': {
        'format': 'png',
        'filename': 'navitrace_leaderboard',
        'height': 600,
        'width': 1200,
        'scale': 2
    }
})

# Detailed table
with st.expander("View Detailed Scores"):
    # Create the summary table
    df_summary = create_summary_table(df)
    # Every column except the model name is a numeric score.
    score_cols = [col for col in df_summary.columns if col != "model"]
    # Display table
    st.dataframe(
        df_summary.style.background_gradient(
            cmap="Blues_r",
            subset=score_cols
        ).format("{:.2f}", subset=score_cols),
        use_container_width=True,
        hide_index=True,
    )

with st.expander("How to Test Your Model", expanded=True):
    # Step 1
    st.markdown("""
1
Run Evaluation
Download and run our evaluation notebook adjusted to your model. The notebook will generate a TSV file with your model's predictions on the test set.
""", unsafe_allow_html=True)
    st.link_button("📓 Open Evaluation Notebook", "https://colab.research.google.com/your-notebook-link", width="stretch")

    # Step 2
    st.markdown("""
2
Upload Results
Upload the TSV file generated by the evaluation notebook.
""", unsafe_allow_html=True)
    uploaded_file = st.file_uploader("Upload your TSV file with results", type=['tsv', 'txt'], label_visibility="collapsed")

    # Step 3
    st.markdown("""
3
Calculate Score
Click the button below to evaluate your predictions. Scores are calculated using hidden test set ground-truths.
""", unsafe_allow_html=True)
    if uploaded_file is not None:
        if st.button("🧮 Calculate Score", width="stretch"):
            with st.spinner("Validating and calculating scores..."):
                # Validate format
                is_valid, result = validate_tsv_format(uploaded_file)
                if is_valid:
                    # Calculate score using hidden ground-truth
                    scores = calculate_score(result)
                    if scores is not None:
                        st.success(f"✅ Score calculated successfully: **{scores['Total Score']:.1f}**")
                        # Store in session state so the leaderboard picks it up.
                        st.session_state.user_results = {
                            "model": "Your Model",
                            **scores
                        }
                        st.info("👆 Scroll up to see your model on the leaderboard!")
                        st.rerun()
                else:
                    st.error(f"❌ Invalid file format: {result}")
    else:
        st.info("👆 Upload a TSV file to calculate your score")

    # Step 4
    st.markdown("""
4
Submit to Official Leaderboard
Happy with your score? Submit your model to appear on the official leaderboard. Fill out the form below with your model details and results.
""", unsafe_allow_html=True)
    st.link_button("🗳️ Submit Model", "https://docs.google.com/forms/d/e/1FAIpQLSfcAQ6JW7eey-8OFSAz2ea_StCezxJK1dt6mjW_wR-9jCHnXg/viewform?usp=dialog", width="stretch")