"""Streamlit app that renders the NaviTrace leaderboard and scores uploads."""

import json
import multiprocessing
import os
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from datasets import load_dataset

from src.score_calculation.score import score_predictions

RESULTS_DIR = "results/"

# Margins shared by every bar chart (values in pixels).
_CHART_MARGIN = dict(
    l=80,  # Left
    r=0,  # Right
    b=80,  # Bottom
    t=70,  # Top
    pad=0,  # Padding
)

# Page config
st.set_page_config(
    page_title="NaviTrace Leaderboard",
    layout="centered",
    initial_sidebar_state="collapsed"
)

# Custom CSS for Nerfies-style design
st.markdown(""" """, unsafe_allow_html=True)


def load_data():
    """Load all result files as one data frame.

    Every ``*.tsv`` file in ``RESULTS_DIR`` is read as a tab-separated table
    and tagged with a ``model`` column derived from the file name
    (underscores replaced by spaces).

    Returns:
        The concatenated data frame, or ``None`` when no result file exists
        or reading fails (the error is shown in the UI in that case).
    """
    try:
        all_dfs = []
        # Sorted so the row order does not depend on the file system.
        for file_path in sorted(Path(RESULTS_DIR).glob('*.tsv')):
            df = pd.read_csv(file_path, sep='\t')
            df["model"] = file_path.stem.replace('_', ' ')
            all_dfs.append(df)
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return None

    if not all_dfs:
        return None
    return pd.concat(all_dfs, ignore_index=True)


def calculate_score(results_df):
    """Calculate score using private test split ground truth.

    The dataset id is read from the ``HF_DATASET_ID`` environment variable
    and the access token from ``HF_TOKEN``.

    Args:
        results_df: Data frame with the uploaded predictions.

    Returns:
        Whatever ``score_predictions`` returns, or ``None`` when scoring
        fails (the error is shown in the UI in that case).
    """
    try:
        dataset_id = os.environ.get("HF_DATASET_ID")
        if not dataset_id:
            raise RuntimeError("HF_DATASET_ID environment variable is not set")

        # Access to private dataset with test labels. The token is handed to
        # load_dataset directly, so no separate login step is required.
        dataset = load_dataset(
            dataset_id,
            split="test",
            token=os.environ.get("HF_TOKEN"),
        )

        return score_predictions(results_df, dataset)
    except Exception as e:
        st.error(f"Error calculating score: {str(e)}")
        return None


def validate_tsv_format(uploaded_file):
    """Validate that the uploaded TSV has the correct format.

    Args:
        uploaded_file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        ``(True, data_frame)`` when the file parses and contains all required
        columns, otherwise ``(False, error_message)``.
    """
    try:
        # The same buffer object may be read on more than one click; rewind
        # it so a second read does not start at end-of-file.
        if hasattr(uploaded_file, "seek"):
            uploaded_file.seek(0)
        df = pd.read_csv(uploaded_file, sep='\t')
    except Exception as e:
        return False, f"Error reading file: {str(e)}"

    # Check for required columns
    required_cols = ["sample_id", "embodiment", "category", "prediction"]
    if not all(col in df.columns for col in required_cols):
        return False, f"Missing required columns. Expected: {required_cols}"
    return True, df


def _mean_scores(df, group_cols):
    """Return the mean score per group, ignoring rows with an infinite score."""
    finite = df[df["score"] != np.inf]
    return finite.groupby(group_cols)[["score"]].mean().reset_index()


def _apply_score_axis_range(fig, scores):
    """Leave 25% headroom above the tallest bar when a finite maximum exists."""
    max_score = scores.max()
    if pd.notna(max_score) and np.isfinite(max_score):
        fig.update_yaxes(range=[0, max_score * 1.25])


def _total_score_chart(df):
    """Build the chart with one bar per model (mean score over all samples)."""
    # Sort the results from best to worst
    df_fig = _mean_scores(df, "model").sort_values(by="score", ascending=True)

    fig = px.bar(
        df_fig,
        x="model",
        y="score",
        color="score",
        color_continuous_scale=px.colors.diverging.Fall,
        orientation="v",
    )
    fig.update_layout(
        xaxis_title_text="Model",
        yaxis_title_text="Score (Lower is better)",
        title_text="",
        font=dict(size=15),
        xaxis_tickangle=-45,
        bargap=0.2,
        height=600,
        showlegend=False,
        margin=dict(_CHART_MARGIN),
    )
    _apply_score_axis_range(fig, df_fig["score"])

    # Remove the color legend from the chart.
    fig.update_coloraxes(showscale=False)

    # Add annotations to show the exact score on each bar.
    fig.update_traces(
        texttemplate="%{y:.2f}",
        textposition="outside"
    )
    return fig


def _per_embodiment_chart(df):
    """Build the grouped chart with one bar per model and embodiment."""
    # Models are ordered by their overall mean score, best first.
    model_order = (
        _mean_scores(df, "model")
        .sort_values(by="score", ascending=True)["model"]
        .tolist()
    )

    # Calculate mean score per model and embodiment
    df_fig = _mean_scores(df, ["model", "embodiment"])

    # An ordered categorical makes sorting follow the leaderboard order
    # instead of the alphabetical one.
    df_fig["model"] = pd.Categorical(
        df_fig["model"], categories=model_order, ordered=True
    )
    df_fig = df_fig.sort_values(by=["model", "score"], ascending=[True, True])

    fig = px.bar(
        df_fig,
        x="model",
        y="score",
        color="embodiment",
        color_discrete_sequence=px.colors.qualitative.Plotly,
        orientation="v",
    )
    fig.update_layout(
        xaxis=dict(
            title=dict(
                text="Model",
                standoff=10,
            ),
            tickangle=-45,
        ),
        yaxis=dict(
            title_text="Score (Lower is better)",
        ),
        title_text="",
        font=dict(size=15),
        bargap=0.1,
        barmode="group",
        height=600,
        margin=dict(_CHART_MARGIN),
        showlegend=True,
        legend=dict(
            orientation="h",
            x=0.5,
            y=1,
            xanchor="center",
            yanchor="bottom",
            borderwidth=0,
            itemclick="toggle",
            itemdoubleclick="toggleothers",
            title=dict(
                text="Embodiments",
                side="top center"
            )
        ),
        uniformtext_minsize=10,
        uniformtext_mode="show",
    )
    _apply_score_axis_range(fig, df_fig["score"])

    # Remove the color legend from the chart.
    fig.update_coloraxes(showscale=False)

    # Add annotations to show the exact score on each bar.
    fig.update_traces(
        texttemplate="%{y:.2f}",
        textposition="outside",
        textangle=-90,
    )
    return fig


def create_bar_chart(df, view_type):
    """Create interactive bar chart based on view type.

    Args:
        df: Results data frame. "Total Score" reads the ``model`` and
            ``score`` columns; "Per Embodiment" additionally reads
            ``embodiment``. May be ``None`` or empty.
        view_type: "Total Score", "Per Embodiment", or any other value
            (treated as "Per Category").

    Returns:
        A Plotly figure. The figure is empty when there is nothing to plot,
        when a required column is missing, and for the per-category view,
        which is not implemented yet.
    """
    required_cols = {
        "Total Score": ("model", "score"),
        "Per Embodiment": ("model", "score", "embodiment"),
    }.get(view_type)

    if required_cols is None:
        # Per Category
        # TODO: add one grouped bar trace per category and the common styling
        # (transparent background, light horizontal grid lines).
        return go.Figure()

    if df is None or df.empty or any(col not in df.columns for col in required_cols):
        return go.Figure()

    if view_type == "Total Score":
        return _total_score_chart(df)
    return _per_embodiment_chart(df)


# Header
st.markdown(""" """, unsafe_allow_html=True)

# Load data
df = load_data()

# Add user's model if it exists in session state
# NOTE(review): the stored row holds the keys returned by score_predictions;
# the charts read a "score" column - confirm the user's row actually has one.
if 'user_results' in st.session_state:
    user_results = pd.DataFrame([st.session_state.user_results])
    if df is None:
        df = user_results
    else:
        df = pd.concat([user_results, df], ignore_index=True)

# View selector
view_type = st.selectbox(
    "Select View",
    ["Total Score", "Per Embodiment", "Per Category"],
)

# Display chart
fig = create_bar_chart(df, view_type)
st.plotly_chart(fig, use_container_width=True, config={
    'displayModeBar': True,
    'displaylogo': False,
    'toImageButtonOptions': {
        'format': 'png',
        'filename': 'navitrace_leaderboard',
        'height': 600,
        'width': 1200,
        'scale': 2
    }
})
st.caption("🔹 Note: Lower scores indicate better performance.")

# Detailed table
with st.expander("View Detailed Scores"):
    pass  # TODO st.dataframe(df.style.background_gradient(cmap='Blues_r', subset=df.columns[1:]), width="stretch")

with st.expander("How to Test Your Model", expanded=True):
    # Step 1
    st.markdown("""

Run Evaluation

Download and run our evaluation notebook adjusted to your model. The notebook will generate a TSV file with your model's predictions on the test set.

""", unsafe_allow_html=True)
    st.link_button("📓 Open Evaluation Notebook", "https://colab.research.google.com/your-notebook-link", width="stretch")

    # Step 2
    st.markdown("""

Upload Results

Upload the TSV file generated by the evaluation notebook.

""", unsafe_allow_html=True)
    uploaded_file = st.file_uploader("Upload your TSV file with results", type=['tsv', 'txt'], label_visibility="collapsed")

    # Step 3
    st.markdown("""

Calculate Score

Click the button below to evaluate your predictions. Scores are calculated using hidden test set ground-truths.

""", unsafe_allow_html=True)
    if uploaded_file is not None:
        if st.button("🧮 Calculate Score", width="stretch"):
            with st.spinner("Validating and calculating scores..."):
                # Validate format
                is_valid, result = validate_tsv_format(uploaded_file)
                if is_valid:
                    # Calculate score using hidden ground-truth
                    scores = calculate_score(result)
                    if scores is not None:
                        st.success(f"✅ Score calculated successfully: **{scores['Total Score']:.1f}**")
                        # Store in session state
                        st.session_state.user_results = {
                            "model": "Your Model",
                            **scores
                        }
                        st.info("👆 Scroll up to see your model on the leaderboard!")
                        # NOTE(review): the rerun starts immediately, so the
                        # two messages above are probably never visible.
                        st.rerun()
                else:
                    st.error(f"❌ Invalid file format: {result}")
    else:
        st.info("👆 Upload a TSV file to calculate your score")

    # Step 4
    st.markdown("""

Submit to Official Leaderboard

Happy with your score? Submit your model to appear on the official leaderboard. Fill out the form below with your model details and results.

""", unsafe_allow_html=True)
    st.link_button("🗳️ Submit Model", "https://docs.google.com/forms/d/e/1FAIpQLSfcAQ6JW7eey-8OFSAz2ea_StCezxJK1dt6mjW_wR-9jCHnXg/viewform?usp=dialog", width="stretch")