from src.score_calculation.score import score_predictions
import ast
from datasets import load_dataset
from huggingface_hub import login
import multiprocessing
import numpy as np
import streamlit as st
from streamlit_chunk_file_uploader import uploader
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from io import StringIO
import json
import os
# Directory holding one TSV of scored predictions per model; consumed by load_data().
RESULTS_DIR = "results/"
# Page config
st.set_page_config(
    page_title="NaviTrace Leaderboard",
    layout="centered",
    initial_sidebar_state="collapsed"
)
# Custom CSS for Nerfies-style design
# NOTE(review): the CSS payload below is empty — confirm whether the styles
# were meant to be inlined here or were lost in an edit.
st.markdown("""
""", unsafe_allow_html=True)
def load_data():
    """Load all result files as one data frame.

    Reads every ``*.tsv`` file in ``RESULTS_DIR``; each file holds one
    model's results and the model name is derived from the file stem
    (underscores become spaces) and stored in a ``model`` column.

    Returns:
        pd.DataFrame | None: Concatenated results, or ``None`` when no
        result files exist or loading fails (the error is shown via
        ``st.error``).
    """
    try:
        all_dfs = []
        # sorted() makes the concatenation order deterministic across
        # platforms (glob order is filesystem-dependent).
        for file_path in sorted(Path(RESULTS_DIR).glob('*.tsv')):
            df = pd.read_csv(file_path, sep='\t')
            # e.g. "gpt_4o.tsv" -> model name "gpt 4o"
            df["model"] = file_path.stem.replace('_', ' ')
            all_dfs.append(df)
        # Concatenate all DataFrames into one
        if all_dfs:
            return pd.concat(all_dfs, ignore_index=True)
        # No result files found: return None explicitly instead of
        # falling through (keeps the contract consistent with the
        # error path below).
        return None
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return None
def calculate_score(results_df):
    """Calculate score using private test split ground truth.

    Args:
        results_df: Validated predictions DataFrame (see
            ``validate_tsv_format``).

    Returns:
        The result of ``score_predictions`` on success, otherwise ``None``.
    """
    try:
        # Access to private dataset with test labels
        login(token=os.environ.get("HF_TOKEN"))
        dataset = load_dataset(os.environ.get("HF_DATASET_ID"), split="test")
        # Calculate score
        return score_predictions(results_df, dataset)
    except Exception as e:
        # Surface the exception detail; the original f-string had no
        # placeholder and left `e` unused, unlike the other handlers.
        st.error(f"Error calculating score: {str(e)}")
        return None
def validate_tsv_format(uploaded_file):
    """Check that an uploaded file parses as TSV with the expected columns.

    Returns:
        tuple: ``(True, DataFrame)`` on success, or ``(False, message)``
        describing the problem.
    """
    required_cols = ["sample_id", "embodiment", "category", "prediction"]
    try:
        df = pd.read_csv(uploaded_file, sep='\t')
    except Exception as e:
        return False, f"Error reading file: {str(e)}"
    # Guard clause: reject files missing any of the mandatory columns.
    present = set(df.columns)
    if any(col not in present for col in required_cols):
        return False, f"Missing required columns. Expected: {required_cols}"
    return True, df
@st.cache_data
def convert_df_to_tsv(df):
    """Serialize *df* as UTF-8 tab-separated bytes (for st.download_button)."""
    tsv_text = df.to_csv(sep='\t', index=False)
    return tsv_text.encode('utf-8')
def create_bar_chart(df, view_type):
    """Create interactive bar chart based on view type.

    Args:
        df: Results DataFrame with "model", "score", "embodiment" and
            "category" columns ("category" holds a stringified list).
        view_type: "Total Score", "Per Embodiment", or anything else
            (treated as "Per Category").

    Returns:
        plotly.graph_objects.Figure: The configured bar chart.
    """
    # Copy df so the caller's frame is untouched.
    df_fig = df.copy()
    # Drop rows carrying the inf sentinel (unscorable samples).
    df_fig = df_fig[df_fig["score"] != np.inf]
    # Split too-long names onto two lines; Plotly renders "<br>" as a line
    # break inside tick labels.  (The original literal was broken across
    # source lines, which is a SyntaxError in Python.)
    model_renaming_map = {
        "Qwen 3 VL 235b Thinking": "Qwen 3 VL 235b<br>Thinking",
    }
    df_fig["model"] = df_fig["model"].map(model_renaming_map).fillna(df_fig["model"])

    if view_type == "Total Score":
        return _total_score_chart(df_fig)
    if view_type == "Per Embodiment":
        return _grouped_score_chart(
            df_fig, "embodiment", "Embodiments", px.colors.qualitative.Plotly
        )
    # Per Category: a sample may belong to several categories, so parse the
    # stringified list and explode it into one row per category.
    df_fig["category"] = df_fig["category"].apply(ast.literal_eval)
    df_fig = df_fig.explode("category")
    return _grouped_score_chart(
        df_fig, "category", "Categories", px.colors.qualitative.Plotly[::-1]
    )


def _total_score_chart(df_fig):
    """One bar per model showing its mean score, best model first."""
    # Calculate mean score per model and sort from best to worst.
    df_fig = df_fig.groupby("model")[["score"]].mean().reset_index()
    df_fig = df_fig.sort_values(by="score", ascending=False)
    fig = px.bar(
        df_fig,
        x="model",
        y="score",
        color="score",
        color_continuous_scale=px.colors.diverging.RdYlBu,
        orientation="v",
    )
    max_score = df_fig["score"].max()
    min_score = df_fig["score"].min()
    fig.update_layout(
        xaxis=dict(
            title=dict(
                text="Model",
                standoff=25,
            ),
            tickangle=-45,
        ),
        yaxis=dict(
            title_text="Score",
            # Padded range keeps the outside text annotations visible.
            range=[min_score * 1.25, max_score * 1.25]
        ),
        title_text="",
        font=dict(size=15),
        bargap=0.2,
        height=600,
        showlegend=False,
        margin=dict(
            l=60,  # Left
            r=0,   # Right
            b=95,  # Bottom
            t=80,  # Top
            pad=0  # Padding
        ),
    )
    # Remove the continuous color scale from the chart.
    fig.update_coloraxes(showscale=False)
    # Annotate each bar with its exact (rounded) score.
    fig.update_traces(
        texttemplate="%{y:.0f}",
        textposition="outside"
    )
    return fig


def _grouped_score_chart(df_fig, group_col, legend_title, color_sequence):
    """Grouped bars: mean score per model and per *group_col* value.

    Models are ordered along the x-axis by their overall mean score.
    """
    # Calculate the model order from the overall mean score.
    df_model_order = df_fig.groupby("model")[["score"]].mean().reset_index()
    model_order = df_model_order.sort_values(by="score", ascending=True)["model"].tolist()
    # Mean score per (model, group) pair.
    df_fig = df_fig.groupby(["model", group_col])[["score"]].mean().reset_index()
    # Categorical dtype pins the x-axis ordering to model_order.
    df_fig["model"] = pd.Categorical(df_fig["model"], categories=model_order, ordered=True)
    df_fig = df_fig.sort_values(by=["model", "score"], ascending=[False, False])
    fig = px.bar(
        df_fig,
        x="model",
        y="score",
        color=group_col,
        color_discrete_sequence=color_sequence,
        orientation="v",
    )
    max_score = df_fig["score"].max()
    min_score = df_fig["score"].min()
    fig.update_layout(
        xaxis=dict(
            title=dict(
                text="Model",
                standoff=25,
            ),
            tickangle=-45,
        ),
        yaxis=dict(
            title_text="Score",
            range=[min_score * 1.25, max_score * 1.25]
        ),
        title_text="",
        font=dict(size=15),
        bargap=0.1,
        barmode="group",
        height=600,
        margin=dict(
            l=60,  # Left
            r=0,   # Right
            b=95,  # Bottom
            t=80,  # Top
            pad=0  # Padding
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            x=0.5,
            y=1.1,
            xanchor="center",
            yanchor="top",
            borderwidth=0,
            itemclick="toggle",
            itemdoubleclick="toggleothers",
            title=dict(
                text=legend_title,
                side="top center"
            )
        ),
        uniformtext_minsize=10,
        uniformtext_mode="show",
    )
    # Discrete colors have no color axis, but keep the original call for parity.
    fig.update_coloraxes(showscale=False)
    return fig
def create_summary_table(df):
    """Build the leaderboard table: one row per model with the overall mean
    score plus per-embodiment and per-category mean scores, sorted by
    "Total Score" descending (model as a regular column)."""
    # Work on a copy with the inf sentinel rows removed.
    scored = df[df["score"] != np.inf].copy()

    # Overall mean score per model.
    totals = scored.groupby("model")[["score"]].mean().reset_index()
    totals.columns = ["model", "Total Score"]

    # One column per embodiment.
    by_embodiment = (
        scored.groupby(["model", "embodiment"])[["score"]]
        .mean()
        .reset_index()
        .pivot(index="model", columns="embodiment", values="score")
    )
    by_embodiment.columns = [str(col) for col in by_embodiment.columns]

    # One column per category; a sample may carry several categories, so the
    # stringified list is parsed and exploded first.
    exploded = scored.copy()
    exploded["category"] = exploded["category"].apply(ast.literal_eval)
    exploded = exploded.explode("category")
    by_category = (
        exploded.groupby(["model", "category"])[["score"]]
        .mean()
        .reset_index()
        .pivot(index="model", columns="category", values="score")
    )
    by_category.columns = [str(col) for col in by_category.columns]

    # Join everything on model, order best-first, and restore model as a column.
    summary = (
        totals.set_index("model")
        .join(by_embodiment)
        .join(by_category)
        .sort_values(by="Total Score", ascending=False)
        .reset_index()
    )
    return summary
def main():
# Header
st.markdown("""