|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import plotly.graph_objects as go |
|
|
import plotly.express as px |
|
|
from io import StringIO |
|
|
import json |
|
|
|
|
|
|
|
|
# Page-level settings. This is the first Streamlit call issued by the script.
_page_settings = {
    "page_title": "NaviTrace Leaderboard",
    "page_icon": "logo.svg",
    "layout": "centered",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_page_settings)
|
|
|
|
|
|
|
|
# Custom stylesheet injected into the page. It is kept in a named constant so
# the call that installs it stays a one-liner.
_PAGE_CSS = """
<style>
/* Import Font Awesome */
@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');

/* Limit page width */
.main .block-container {
    max-width: 900px;
    padding-top: 3rem;
}

/* Main title styling */
.main-title {
    text-align: center;
    font-size: 3rem;
    font-weight: 700;
    margin-top: 1rem;
    margin-bottom: 2rem;
    color: #333;
}

/* Button links container - Nerfies style */
.button-links {
    display: flex;
    justify-content: center;
    gap: 1rem;
    margin-bottom: 3rem;
    flex-wrap: wrap;
}

.button-link {
    display: inline-flex;
    align-items: center;
    gap: 0.5rem;
    padding: 0.6rem 1.5rem;
    background-color: #f8f9fa;
    border: 1px solid #dee2e6;
    border-radius: 50px;
    text-decoration: none;
    color: #333;
    font-weight: 500;
    transition: all 0.3s ease;
    font-size: 0.95rem;
}

.button-link:hover {
    background-color: #e9ecef;
    transform: translateY(-2px);
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    color: #333;
    text-decoration: none;
}

.button-link i {
    font-size: 1rem;
}

/* Section headers */
.section-header {
    font-size: 1.8rem;
    font-weight: 600;
    margin-top: 3rem;
    margin-bottom: 1.5rem;
    color: #333;
}

/* Instructions styling */
.instruction-item {
    display: flex;
    gap: 1.5rem;
    margin: 2rem 0;
    align-items: flex-start;
}

.instruction-number {
    flex-shrink: 0;
    width: 40px;
    height: 40px;
    border-radius: 50%;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    display: flex;
    align-items: center;
    justify-content: center;
    font-weight: 700;
    font-size: 1.2rem;
}

.instruction-content {
    flex-grow: 1;
    padding-top: 0.3rem;
}

.instruction-title {
    font-size: 1.1rem;
    font-weight: 600;
    margin-bottom: 0.5rem;
    color: #333;
}

.instruction-desc {
    color: #666;
    line-height: 1.6;
}

/* Streamlit button styling */
.stButton>button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    font-weight: 600;
    border: none;
    padding: 0.5rem 2rem;
    border-radius: 6px;
    transition: transform 0.2s;
}

.stButton>button:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
}

/* Hide streamlit branding */
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}

/* Expander styling */
.streamlit-expanderHeader {
    font-size: 1.1rem;
    font-weight: 600;
}

/* File uploader */
[data-testid="stFileUploader"] {
    margin: 1rem 0;
}
</style>
"""

st.markdown(_PAGE_CSS, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
def load_sample_data():
    """Return the built-in demo leaderboard as a DataFrame.

    The frame has one row per model and eight columns: ``Model``,
    ``Total Score``, three ``Embodiment-*`` scores and three ``Category-*``
    scores. All values are hard-coded in this function.
    """
    columns = [
        'Model',
        'Total Score',
        'Embodiment-A',
        'Embodiment-B',
        'Embodiment-C',
        'Category-Spatial',
        'Category-Temporal',
        'Category-Object',
    ]
    # One tuple per model, in the same order as ``columns``.
    rows = [
        ('GPT-4', 87.5, 90.2, 85.8, 86.5, 88.9, 86.3, 87.3),
        ('Claude-3.5-Sonnet', 85.2, 87.5, 84.1, 84.0, 86.7, 84.2, 84.7),
        ('Gemini-Pro', 82.1, 84.3, 81.2, 80.8, 83.5, 81.0, 81.8),
        ('Llama-3-70B', 78.3, 80.1, 77.9, 76.9, 79.8, 77.5, 77.6),
        ('Mistral-Large', 75.6, 77.8, 74.5, 74.5, 76.9, 75.1, 74.8),
    ]
    return pd.DataFrame(rows, columns=columns)
|
|
|
|
|
def calculate_score_backend(results_df):
    """
    Score uploaded predictions against the private test split.

    Intended behaviour, as described by the original design notes:
        1. Load the private test split ground truth (not exposed to users)
        2. Compare uploaded predictions with ground truth
        3. Calculate metrics per embodiment and category
        4. Return detailed scores

    NOTE(review): the body below does none of that yet. It returns fixed
    placeholder numbers and never reads ``results_df``.

    Args:
        results_df: DataFrame with columns ['sample_id', 'prediction', ...]

    Returns:
        dict: Scores breakdown keyed by leaderboard column name, or None
        if an exception is raised while building it (the error is also
        shown through ``st.error``).
    """
    try:
        embodiment_scores = {
            'Embodiment-A': 87.0,
            'Embodiment-B': 84.0,
            'Embodiment-C': 84.0,
        }
        category_scores = {
            'Category-Spatial': 86.0,
            'Category-Temporal': 85.0,
            'Category-Object': 84.0,
        }
        # Key order matches the leaderboard columns: total first, then the
        # embodiment breakdown, then the category breakdown.
        scores = {'Total Score': 85.0, **embodiment_scores, **category_scores}
    except Exception as e:
        st.error(f"Error calculating score: {str(e)}")
        return None
    return scores
|
|
|
|
|
def validate_tsv_format(uploaded_file):
    """Parse an uploaded TSV and check that it has the required columns.

    Args:
        uploaded_file: Anything ``pd.read_csv`` accepts; in this app it is
            the file-like object returned by the upload widget.

    Returns:
        tuple: ``(True, DataFrame)`` when the file parses as tab-separated
        text and contains both ``sample_id`` and ``prediction`` columns,
        otherwise ``(False, message)`` where ``message`` describes the
        problem.
    """
    required_cols = ['sample_id', 'prediction']

    try:
        # Reading consumes the stream. Rewind first so that validating the
        # same file-like object a second time does not see an empty file
        # and fail with "No columns to parse".
        seekable = getattr(uploaded_file, 'seekable', None)
        if callable(seekable) and seekable():
            uploaded_file.seek(0)
        df = pd.read_csv(uploaded_file, sep='\t')
    except Exception as e:
        # UI boundary: any read or parse failure is reported to the caller
        # as a message rather than raised.
        return False, f"Error reading file: {str(e)}"

    if any(col not in df.columns for col in required_cols):
        return False, f"Missing required columns. Expected: {required_cols}"
    return True, df
|
|
|
|
|
def _grouped_bar_figure(df, column_prefix, title):
    """Build a grouped bar figure with one trace per ``column_prefix*`` column."""
    fig = go.Figure()
    for col in df.columns:
        if not col.startswith(column_prefix):
            continue
        fig.add_trace(go.Bar(
            # The legend shows the column name without its prefix.
            name=col.replace(column_prefix, ''),
            x=df['Model'],
            y=df[col],
            text=df[col].round(1),
            textposition='outside',
        ))
    fig.update_layout(
        title=title,
        xaxis_title="Model",
        yaxis_title="Score",
        yaxis_range=[0, 100],
        barmode='group',
        height=500,
    )
    return fig


def create_bar_chart(df, view_type):
    """Create interactive bar chart based on view type.

    Args:
        df: Leaderboard frame with a ``Model`` column, a ``Total Score``
            column, and ``Embodiment-*`` / ``Category-*`` score columns.
        view_type: ``"Total Score"`` draws one bar per model;
            ``"Per Embodiment"`` draws one grouped trace per
            ``Embodiment-*`` column; any other value draws one grouped
            trace per ``Category-*`` column.

    Returns:
        A plotly ``Figure`` with a transparent background, a 0-100 y axis,
        and a legend on every view except ``"Total Score"``.
    """
    if view_type == "Total Score":
        fig = go.Figure(data=[
            go.Bar(
                x=df['Model'],
                y=df['Total Score'],
                marker_color=px.colors.sequential.Purples_r,
                text=df['Total Score'].round(1),
                textposition='outside',
            )
        ])
        fig.update_layout(
            title="Model Performance - Total Score",
            xaxis_title="Model",
            yaxis_title="Score",
            yaxis_range=[0, 100],
            height=500,
        )
    elif view_type == "Per Embodiment":
        fig = _grouped_bar_figure(
            df, 'Embodiment-', "Model Performance - Per Embodiment"
        )
    else:
        fig = _grouped_bar_figure(
            df, 'Category-', "Model Performance - Per Category"
        )

    # Styling shared by every view.
    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(size=12),
        showlegend=(view_type != "Total Score"),
        margin=dict(t=80, b=60, l=60, r=60),
    )
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=True, gridcolor='lightgray', gridwidth=0.5)

    return fig
|
|
|
|
|
|
|
|
# Page title.
_title_html = '<h1 class="main-title">NaviTrace Leaderboard</h1>'
st.markdown(_title_html, unsafe_allow_html=True)

# Row of pill-shaped links shown under the title.
_button_links_html = """
<div class="button-links">
    <a href="https://your-paper-website.com" target="_blank" class="button-link">
        <i class="fas fa-file-pdf"></i> Paper
    </a>
    <a href="https://huggingface.co/datasets/your-username/navitrace" target="_blank" class="button-link">
        <i class="fas fa-database"></i> Dataset
    </a>
    <a href="https://github.com/your-username/navitrace" target="_blank" class="button-link">
        <i class="fab fa-github"></i> Code
    </a>
    <a href="https://your-demo-link.com" target="_blank" class="button-link">
        <i class="far fa-images"></i> Demo
    </a>
</div>
"""
st.markdown(_button_links_html, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
# Start from the built-in sample rows.
df = load_sample_data()

# A score computed earlier in this session is shown as the first row.
if 'user_results' in st.session_state:
    user_row = pd.DataFrame([st.session_state.user_results])
    df = pd.concat([user_row, df], ignore_index=True)

# Empty anchor element followed by the section heading.
st.markdown('<div id="leaderboard"></div>', unsafe_allow_html=True)
st.markdown('<h2 class="section-header">Leaderboard</h2>', unsafe_allow_html=True)

_view_options = ["Total Score", "Per Embodiment", "Per Category"]
view_type = st.selectbox("Select View", _view_options)

# Settings for the chart's image-export toolbar button.
_image_export_options = {
    'format': 'png',
    'filename': 'navitrace_leaderboard',
    'height': 600,
    'width': 1200,
    'scale': 2
}
_chart_config = {
    'displayModeBar': True,
    'displaylogo': False,
    'toImageButtonOptions': _image_export_options
}

fig = create_bar_chart(df, view_type)
st.plotly_chart(fig, use_container_width=True, config=_chart_config)
|
|
|
|
|
|
|
|
# Full score table; every column after the first gets a purple gradient.
with st.expander("View Detailed Scores"):
    _score_columns = df.columns[1:]
    _styled_scores = df.style.background_gradient(cmap='Purples', subset=_score_columns)
    st.dataframe(_styled_scores, use_container_width=True)

# The current chart as HTML, shown as source and offered as a download.
with st.expander("Embed Chart in Your Website"):
    html_str = fig.to_html(include_plotlyjs='cdn')
    st.code(html_str, language='html')
    st.download_button(
        label="Download HTML",
        data=html_str,
        file_name="navitrace_chart.html",
        mime="text/html"
    )
|
|
|
|
|
|
|
|
st.markdown('<h2 class="section-header">Evaluate Your Model</h2>', unsafe_allow_html=True)

with st.expander("How to Test Your Model", expanded=False):
    # Step 1: run the evaluation notebook.
    st.markdown("""
    <div class="instruction-item">
        <div class="instruction-number">1</div>
        <div class="instruction-content">
            <div class="instruction-title">Run Evaluation</div>
            <div class="instruction-desc">
                Download and run our evaluation notebook on your model. The notebook will generate a TSV file with your model's predictions on the test set.
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)

    st.link_button("📓 Open Evaluation Notebook", "https://colab.research.google.com/your-notebook-link", use_container_width=False)

    # Step 2: upload the generated predictions.
    st.markdown("""
    <div class="instruction-item">
        <div class="instruction-number">2</div>
        <div class="instruction-content">
            <div class="instruction-title">Upload Results</div>
            <div class="instruction-desc">
                Upload the TSV file generated by the evaluation notebook. Your predictions will be evaluated against our private test set.
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)

    uploaded_file = st.file_uploader("Choose a TSV file", type=['tsv', 'txt'])

    # Step 3: score the upload.
    st.markdown("""
    <div class="instruction-item">
        <div class="instruction-number">3</div>
        <div class="instruction-content">
            <div class="instruction-title">Calculate Score</div>
            <div class="instruction-desc">
                Click the button below to evaluate your predictions. Scores are calculated using the private test set ground truth.
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)

    if uploaded_file is not None:
        if st.button("Calculate Score", use_container_width=False):
            with st.spinner("Validating and calculating scores..."):
                is_valid, result = validate_tsv_format(uploaded_file)
                if is_valid:
                    scores = calculate_score_backend(result)
                    if scores is not None:
                        st.session_state.user_results = {
                            'Model': 'Your Model',
                            **scores
                        }
                        # st.rerun() restarts the script immediately, so a
                        # message drawn here would be discarded before the
                        # user could read it. Keep it in session state and
                        # draw it on the rerun instead.
                        st.session_state['score_notice'] = (
                            f"✅ Score calculated successfully: **{scores['Total Score']:.1f}**"
                        )
                        st.rerun()
                else:
                    st.error(f"❌ Invalid file format: {result}")

        # One-shot confirmation left behind by the scoring run above.
        if 'score_notice' in st.session_state:
            st.success(st.session_state['score_notice'])
            st.info("👆 Scroll up to see your model on the leaderboard!")
            del st.session_state['score_notice']
    else:
        st.info("👆 Upload a TSV file to calculate your score")

    # Step 4: submit to the official leaderboard.
    st.markdown("""
    <div class="instruction-item">
        <div class="instruction-number">4</div>
        <div class="instruction-content">
            <div class="instruction-title">Submit to Official Leaderboard</div>
            <div class="instruction-desc">
                Happy with your results? Submit your model to appear on the official leaderboard.
                All submissions undergo manual verification to ensure quality and prevent gaming.
                Fill out the form below with your model details and results.
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)

    st.link_button("Submit Model (Google Form)", "https://forms.gle/your-google-form-link", use_container_width=False)
|
|
|
|
|
|
|
|
# Horizontal rule, then a centred footer with the contact link.
st.markdown("---")

_footer_html = """
<div style="text-align: center; color: #666; padding: 2rem 0;">
    <p>NaviTrace Benchmark | <a href="mailto:your-email@domain.com" style="color: #667eea;">Contact</a></p>
</div>
"""
st.markdown(_footer_html, unsafe_allow_html=True)