import streamlit as st
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from io import StringIO
import json

# Page config
st.set_page_config(
    page_title="NaviTrace Leaderboard",
    layout="centered",
    initial_sidebar_state="collapsed"
)

# Custom CSS for Nerfies-style design
st.markdown(""" """, unsafe_allow_html=True)


# Sample data - Replace with your actual data
@st.cache_data
def load_data():
    """Return the static leaderboard as a DataFrame (one row per model).

    Cached with ``st.cache_data`` so the table is not rebuilt on every
    Streamlit rerun. Replace the hard-coded rows with a real data source
    when one is available.

    Returns:
        pd.DataFrame: columns 'Model', 'Total Score', three 'Embodiment-*'
        columns and three 'Category-*' columns (scores on a 0-100 scale).
    """
    return pd.DataFrame({
        'Model': ['GPT-4', 'Claude-3.5-Sonnet', 'Gemini-Pro', 'Llama-3-70B', 'Mistral-Large'],
        'Total Score': [87.5, 85.2, 82.1, 78.3, 75.6],
        'Embodiment-A': [90.2, 87.5, 84.3, 80.1, 77.8],
        'Embodiment-B': [85.8, 84.1, 81.2, 77.9, 74.5],
        'Embodiment-C': [86.5, 84.0, 80.8, 76.9, 74.5],
        'Category-Spatial': [88.9, 86.7, 83.5, 79.8, 76.9],
        'Category-Temporal': [86.3, 84.2, 81.0, 77.5, 75.1],
        'Category-Object': [87.3, 84.7, 81.8, 77.6, 74.8],
    })


def calculate_score(results_df):
    """
    Calculate score using private test split ground truth.

    This function should:
    1. Load the private test split ground truth (not exposed to users)
    2. Compare uploaded predictions with ground truth
    3. Calculate metrics per embodiment and category
    4. Return detailed scores

    Args:
        results_df: DataFrame with columns ['sample_id', 'prediction', ...]

    Returns:
        dict: Scores breakdown or None if error
    """
    try:
        # TODO: Implement your scoring logic here
        # Example structure:
        # ground_truth = load_private_test_split()  # From secure location
        # scores = evaluate_predictions(results_df, ground_truth)

        # Placeholder - replace with actual calculation
        scores = {
            'Total Score': 85.0,
            'Embodiment-A': 87.0,
            'Embodiment-B': 84.0,
            'Embodiment-C': 84.0,
            'Category-Spatial': 86.0,
            'Category-Temporal': 85.0,
            'Category-Object': 84.0,
        }
        return scores
    except Exception as e:
        # Boundary handler: surface the failure in the UI instead of
        # crashing the app; callers treat None as "scoring failed".
        st.error(f"Error calculating score: {str(e)}")
        return None


def validate_tsv_format(uploaded_file):
    """Validate that the uploaded TSV has the correct format.

    Args:
        uploaded_file: file-like object returned by ``st.file_uploader``.

    Returns:
        tuple: ``(True, DataFrame)`` when the file parses and contains the
        required columns, otherwise ``(False, error message)``.
    """
    try:
        df = pd.read_csv(uploaded_file, sep='\t')

        # TODO: Add your specific validation logic
        # Check for required columns, data types, etc.
        required_cols = ['sample_id', 'prediction']  # Adjust as needed
        if not all(col in df.columns for col in required_cols):
            return False, f"Missing required columns. Expected: {required_cols}"

        return True, df
    except Exception as e:
        # Any read/parse failure (malformed TSV, bad encoding, ...) becomes
        # a user-facing message rather than an uncaught exception.
        return False, f"Error reading file: {str(e)}"


def create_bar_chart(df, view_type):
    """Create interactive bar chart based on view type.

    Args:
        df: leaderboard DataFrame (see ``load_data`` for expected columns).
        view_type: one of "Total Score", "Per Embodiment", "Per Category".

    Returns:
        plotly.graph_objects.Figure: the styled bar chart.
    """
    if view_type == "Total Score":
        # Single trace: one bar per model.
        fig = go.Figure(data=[
            go.Bar(
                x=df['Model'],
                y=df['Total Score'],
                marker_color=px.colors.sequential.Purples_r,
                text=df['Total Score'].round(1),
                textposition='outside',
            )
        ])
        fig.update_layout(
            title="Model Performance - Total Score",
            xaxis_title="Model",
            yaxis_title="Score",
            yaxis_range=[0, 100],
            height=500,
        )
    else:
        # Grouped view: one bar trace per embodiment/category column. The two
        # views differ only in the column prefix and the chart title.
        if view_type == "Per Embodiment":
            prefix = 'Embodiment-'
            title = "Model Performance - Per Embodiment"
        else:  # Per Category
            prefix = 'Category-'
            title = "Model Performance - Per Category"

        group_cols = [col for col in df.columns if col.startswith(prefix)]
        fig = go.Figure()
        for col in group_cols:
            fig.add_trace(go.Bar(
                name=col.replace(prefix, ''),
                x=df['Model'],
                y=df[col],
                text=df[col].round(1),
                textposition='outside',
            ))
        fig.update_layout(
            title=title,
            xaxis_title="Model",
            yaxis_title="Score",
            yaxis_range=[0, 100],
            barmode='group',
            height=500,
        )

    # Common styling (legend only makes sense for the grouped views)
    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(size=12),
        showlegend=(view_type != "Total Score"),
        margin=dict(t=80, b=60, l=60, r=60),
    )
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=True, gridcolor='lightgray', gridwidth=0.5)

    return fig


# Main content
st.title("NaviTrace Leaderboard")

# Links
st.markdown(""" """, unsafe_allow_html=True)

# Load data
df = load_data()

# Add user's model if it exists in session state (prepended so the user's
# row appears first on the leaderboard)
if 'user_results' in st.session_state:
    user_row = pd.DataFrame([st.session_state.user_results])
    df = pd.concat([user_row, df], ignore_index=True)

# View selector
view_type = st.selectbox(
    "Select View",
    ["Total Score", "Per Embodiment", "Per Category"],
)

# Display chart
fig = create_bar_chart(df, view_type)
st.plotly_chart(fig, use_container_width=True, config={
    'displayModeBar': True,
    'displaylogo': False,
    'toImageButtonOptions': {
        'format': 'png',
        'filename': 'navitrace_leaderboard',
        'height': 600,
        'width': 1200,
        'scale': 2
    }
})

# Detailed table
with st.expander("View Detailed Scores"):
    st.dataframe(
        df.style.background_gradient(cmap='Purples', subset=df.columns[1:]),
        use_container_width=True,
    )

with st.expander("How to Test Your Model", expanded=True):
    # Step 1
    st.markdown("""
1
Run Evaluation
Download and run our evaluation notebook adjusted to your model. The notebook will generate a TSV file with your model's predictions on the test set.
""", unsafe_allow_html=True)
    st.link_button(
        "📓 Open Evaluation Notebook",
        "https://colab.research.google.com/your-notebook-link",
        use_container_width=True,
    )

    # Step 2
    st.markdown("""
2
Upload Results
Upload the TSV file generated by the evaluation notebook.
""", unsafe_allow_html=True)
    uploaded_file = st.file_uploader(
        "Upload your TSV file with results",
        type=['tsv', 'txt'],
        label_visibility="collapsed",
    )

    # Step 3
    st.markdown("""
3
Calculate Score
Click the button below to evaluate your predictions. Scores are calculated using hidden test set ground-truths.
""", unsafe_allow_html=True)
    if uploaded_file is not None:
        if st.button("🧮 Calculate Score", use_container_width=True):
            with st.spinner("Validating and calculating scores..."):
                # Validate format
                is_valid, result = validate_tsv_format(uploaded_file)
                if is_valid:
                    # Calculate score using hidden ground-truth
                    scores = calculate_score(result)
                    if scores is not None:
                        st.success(f"✅ Score calculated successfully: **{scores['Total Score']:.1f}**")
                        # Store in session state so the leaderboard above
                        # includes this row on the next run
                        st.session_state.user_results = {
                            'Model': 'Your Model',
                            **scores
                        }
                        st.info("👆 Scroll up to see your model on the leaderboard!")
                        # NOTE(review): st.rerun() restarts the script so the
                        # chart picks up the new row, but it also clears the
                        # success/info messages just shown — confirm this
                        # transient display is acceptable.
                        st.rerun()
                else:
                    st.error(f"❌ Invalid file format: {result}")
    else:
        st.info("👆 Upload a TSV file to calculate your score")

    # Step 4
    st.markdown("""
4
Submit to Official Leaderboard
Happy with your score? Submit your model to appear on the official leaderboard. Fill out the form below with your model details and results.
""", unsafe_allow_html=True)
    st.link_button(
        "🗳️ Submit Model",
        "https://forms.gle/your-google-form-link",
        use_container_width=True,
    )