Borchmann's picture
Add effort validation for agentic submissions and uniform-effort display
dfedb16
raw
history blame
128 kB
"""
MADQA Leaderboard - Streamlit Version
Benchmark for evaluating AI systems on document collection question answering.
Based on the paper: "Strategic Navigation or Stochastic Search?
How Agents and Humans Reason Over Document Collections"
Color palette: Snowflake colors
- SNOWFLAKE BLUE: #29B5E8
- MID-BLUE: #11567F
- MIDNIGHT: #000000
- MEDIUM GRAY: #5B5B5B
- STAR BLUE: #75CDD7
- VALENCIA ORANGE: #FF9F36
- FIRST LIGHT: #D45B90
- PURPLE MOON: #7254A3
"""
import base64
import json
import os
import secrets
import shutil
import sys
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlencode, quote, unquote
# Parallelization config for LLM evaluation
MAX_EVAL_WORKERS = 24
import pandas as pd
import plotly.graph_objects as go
import requests
import streamlit as st
from huggingface_hub import snapshot_download, HfApi, hf_hub_download
# Add eval module to path
sys.path.insert(0, str(Path(__file__).parent / "eval"))
try:
from metrics import (
anls_star,
anls_star_llm,
aggregate_anls_star_llm,
standard_error,
confidence_interval,
citation_f1,
kuiper_statistic,
get_effort_value,
LLM_JUDGE_SPECIFICITY,
LLM_JUDGE_SENSITIVITY
)
from datasets import load_dataset
EVAL_AVAILABLE = True
except ImportError:
EVAL_AVAILABLE = False
# Fallback values for constants
LLM_JUDGE_SPECIFICITY = 1.0
LLM_JUDGE_SENSITIVITY = 0.98
# Page configuration
st.set_page_config(
page_title="MADQA Leaderboard",
page_icon="📄",
layout="wide",
initial_sidebar_state="collapsed",
)
# HuggingFace Hub configuration
TOKEN = os.environ.get("HF_TOKEN")
QUEUE_REPO = "agentic-document-ai/backend-requests"
RESULTS_REPO = "agentic-document-ai/backend-results"
CACHE_PATH = os.getenv("HF_HOME", ".")
# Submission rate limiting
SUBMISSION_LIMITS_FILE = "submission_limits.json"
SUBMISSION_LIMIT_HOURS = float(os.environ.get("SUBMISSION_LIMIT_HOURS", 24)) # Configurable, default 24 hours
NEWS_FILE = "news.json"
NEWS_MAX_DISPLAY = 5
def get_submission_limits() -> dict:
    """Fetch the per-user submission timestamp file from the results repo.

    Returns a mapping of username -> ISO timestamp of last submission, or an
    empty dict when the file does not exist yet (or the hub is unreachable).
    """
    try:
        local_path = hf_hub_download(
            repo_id=RESULTS_REPO,
            filename=SUBMISSION_LIMITS_FILE,
            repo_type="dataset",
            token=TOKEN,
        )
        with open(local_path) as fh:
            return json.load(fh)
    except Exception:
        # File doesn't exist yet
        return {}
def can_user_submit(username: str) -> tuple[bool, str, float]:
    """Check if user can submit based on rate limit.

    Args:
        username: HuggingFace username to look up in the limits file.

    Returns:
        (can_submit, message, hours_remaining) — message and hours_remaining
        are only meaningful when can_submit is False.
    """
    limits = get_submission_limits()
    if username not in limits:
        return True, "", 0
    try:
        last_submission = datetime.fromisoformat(limits[username])
    except (TypeError, ValueError):
        # Corrupt or unparseable record: fail open rather than lock the user out.
        return True, "", 0
    if last_submission.tzinfo is None:
        # Defensive: record_submission writes aware UTC timestamps, but a naive
        # one would make the subtraction below raise.  Assume UTC (TODO confirm
        # no other writer stores local-time naive stamps).
        last_submission = last_submission.replace(tzinfo=timezone.utc)
    now = datetime.now(timezone.utc)
    hours_since = (now - last_submission).total_seconds() / 3600
    if hours_since < SUBMISSION_LIMIT_HOURS:
        hours_remaining = SUBMISSION_LIMIT_HOURS - hours_since
        hours = int(hours_remaining)
        minutes = int((hours_remaining - hours) * 60)
        return False, f"Please wait {hours}h {minutes}m before your next test set submission.", hours_remaining
    return True, "", 0
def record_submission(username: str):
    """Record a new submission timestamp for the user.

    Re-fetches the limits file, stamps the user with the current UTC time, and
    uploads the updated file.  Failures only produce a UI warning — submission
    itself is not blocked.
    """
    import tempfile
    # Get current limits (fresh, not cached)
    limits = get_submission_limits()
    # Update with new timestamp
    limits[username] = datetime.now(timezone.utc).isoformat()
    temp_path = None
    try:
        # Serialize to a temp file because upload_file wants a path.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(limits, f, indent=2)
            temp_path = f.name
        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=SUBMISSION_LIMITS_FILE,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
        )
    except Exception as e:
        st.warning(f"Could not record submission time: {e}")
    finally:
        # Always remove the temp file, even when the upload fails
        # (the original only unlinked on the success path, leaking the file).
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
def get_news() -> list:
    """Load news items from HF Hub, newest first.

    Falls back to a single default item when the news file is missing or
    cannot be read.
    """
    try:
        path = hf_hub_download(
            repo_id=RESULTS_REPO,
            filename=NEWS_FILE,
            repo_type="dataset",
            token=TOKEN,
        )
        with open(path) as fh:
            items = json.load(fh)
        # Newest entries first.
        items.sort(key=lambda entry: entry.get('date', ''), reverse=True)
        return items
    except Exception:
        # Return default news if file doesn't exist
        return [
            {"date": "2025-01-04", "text": "Leaderboard launched! Submit your results to appear on the board."}
        ]
def save_news(news: list) -> tuple[bool, str]:
    """Save news items to HF Hub.

    Args:
        news: List of {"date": ..., "text": ...} dicts; sorted newest-first
            in place before upload.

    Returns:
        (success, message).
    """
    import tempfile
    temp_path = None
    try:
        # Sort by date descending before saving
        news.sort(key=lambda x: x.get('date', ''), reverse=True)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(news, f, indent=2)
            temp_path = f.name
        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=NEWS_FILE,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message="Update news"
        )
        return True, "News updated successfully"
    except Exception as e:
        return False, f"Failed to save news: {str(e)}"
    finally:
        # Clean up the temp file whether or not the upload succeeded
        # (the original leaked it whenever upload_file raised).
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
def get_oauth_config() -> dict | None:
"""Get HuggingFace OAuth configuration from environment variables.
These are automatically set by HuggingFace Spaces when hf_oauth: true is in README.md.
See: https://huggingface.co/docs/hub/en/spaces-oauth
"""
client_id = os.environ.get("OAUTH_CLIENT_ID")
client_secret = os.environ.get("OAUTH_CLIENT_SECRET")
if client_id and client_secret:
return {
"client_id": client_id,
"client_secret": client_secret,
"scopes": os.environ.get("OAUTH_SCOPES", "openid profile"),
"provider_url": os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co"),
}
return None
def get_hf_user() -> dict | None:
    """Return the logged-in HuggingFace user, or None when nobody is signed in.

    The returned dict has 'username', 'name' and 'picture' keys.
    Works on HuggingFace Spaces with hf_oauth: true in README.md.
    For local testing, set environment variable:
        TEST_HF_USER=your_username
    """
    # Local-development override takes precedence over any OAuth session.
    override = os.environ.get("TEST_HF_USER")
    if override:
        return {
            'username': override,
            'name': override,
            'picture': '',
        }
    # The OAuth callback handler stores the user dict in session state.
    if 'hf_user' in st.session_state and st.session_state.hf_user:
        return st.session_state.hf_user
    return None
def handle_oauth_callback():
    """Handle OAuth callback from HuggingFace.

    After user authorizes, HF redirects back with 'code' and 'state' query params.
    We exchange the code for tokens and store user info in session state.

    Returns:
        True when a login completed (or the user was already logged in),
        False otherwise.  Query params are cleared on every exit path that
        saw a callback, so a stale code is never reprocessed on rerun.

    Note: We don't strictly validate state because Streamlit session state is lost
    during the redirect flow. The OAuth is still secure because:
    1. The code can only be used once
    2. The code is tied to our client_id
    3. We're on HTTPS in production
    """
    try:
        query_params = st.query_params
    except Exception:
        # SessionInfo not yet initialized - skip OAuth handling on this run
        return False
    # Check if this is an OAuth callback
    code = query_params.get("code")
    if not code:
        return False
    # If user is already logged in, just clear the query params
    try:
        if 'hf_user' in st.session_state and st.session_state.hf_user:
            st.query_params.clear()
            return True
    except Exception:
        pass
    oauth_config = get_oauth_config()
    if not oauth_config:
        st.query_params.clear()
        return False
    # Get redirect URI - must match what HuggingFace expects (.hf.space domain)
    space_host = os.environ.get("SPACE_HOST", "")
    if space_host:
        redirect_uri = f"https://{space_host}"
    else:
        redirect_uri = "http://localhost:8501"
    # Exchange code for tokens
    token_url = f"{oauth_config['provider_url']}/oauth/token"
    try:
        # Client authentication: HTTP Basic header of client_id:client_secret.
        credentials = f"{oauth_config['client_id']}:{oauth_config['client_secret']}"
        auth_header = base64.b64encode(credentials.encode()).decode()
        response = requests.post(
            token_url,
            data={
                "grant_type": "authorization_code",
                "code": code,
                "redirect_uri": redirect_uri,
                "client_id": oauth_config["client_id"],
            },
            headers={
                "Authorization": f"Basic {auth_header}",
                "Content-Type": "application/x-www-form-urlencoded",
            },
            timeout=10,
        )
        if response.status_code != 200:
            # Code might have been used already or expired - clear and let user retry
            st.query_params.clear()
            return False
        tokens = response.json()
        access_token = tokens.get("access_token")
        # Get user info
        userinfo_url = f"{oauth_config['provider_url']}/oauth/userinfo"
        userinfo_response = requests.get(
            userinfo_url,
            headers={"Authorization": f"Bearer {access_token}"},
            timeout=10,
        )
        if userinfo_response.status_code == 200:
            userinfo = userinfo_response.json()
            st.session_state.hf_user = {
                'username': userinfo.get('preferred_username', userinfo.get('name', '')),
                'name': userinfo.get('name', ''),
                'picture': userinfo.get('picture', ''),
            }
            # Clean up query params
            st.query_params.clear()
            return True
    except Exception as e:
        # Silent failure - user can retry login
        pass
    # All failure paths (bad userinfo response or exception) end here.
    st.query_params.clear()
    return False
def is_running_on_hf_spaces() -> bool:
    """Check if the app is running on HuggingFace Spaces (SPACE_ID env var set)."""
    return "SPACE_ID" in os.environ
def get_login_url() -> str | None:
    """Generate the HuggingFace OAuth login URL, or None if OAuth is unconfigured."""
    cfg = get_oauth_config()
    if not cfg:
        return None
    # Redirect URI must use the .hf.space domain (required by HuggingFace OAuth);
    # fall back to localhost for local development.
    space_host = os.environ.get("SPACE_HOST", "")
    redirect_uri = f"https://{space_host}" if space_host else "http://localhost:8501"
    # A random state is required by the OAuth spec, but we can't validate it
    # reliably because Streamlit session state is lost during the redirect.
    query = urlencode({
        "client_id": cfg["client_id"],
        "redirect_uri": redirect_uri,
        "scope": cfg["scopes"],
        "state": secrets.token_urlsafe(16),
        "response_type": "code",
    })
    return f"{cfg['provider_url']}/oauth/authorize?{query}"
def show_login_button():
    """Show the HuggingFace login button.

    Renders a styled anchor pointing at the OAuth authorize URL.  Returns True
    when the button was rendered, False when OAuth is not configured (no URL).
    """
    login_url = get_login_url()
    if login_url:
        # Use custom HTML styled like Streamlit's default button, aligned left.
        # target="_self" keeps the OAuth redirect in this tab so the callback
        # query params come back to the app.
        st.markdown(f'''
        <a href="{login_url}" target="_self" style="
            display: inline-flex;
            align-items: center;
            justify-content: center;
            padding: 0.25rem 0.75rem;
            background-color: transparent;
            color: inherit;
            border: 1px solid rgba(250, 250, 250, 0.2);
            border-radius: 0.5rem;
            text-decoration: none;
            font-size: 0.875rem;
            font-weight: 400;
            line-height: 1.6;
            cursor: pointer;
            transition: border-color 0.2s, background-color 0.2s;
        " onmouseover="this.style.borderColor='rgba(250,250,250,0.6)'; this.style.backgroundColor='rgba(250,250,250,0.05)';"
        onmouseout="this.style.borderColor='rgba(250,250,250,0.2)'; this.style.backgroundColor='transparent';">
        Sign in with Hugging Face
        </a>
        ''', unsafe_allow_html=True)
        return True
    return False
def logout():
    """Log out the current user by removing the OAuth user dict from session state."""
    if 'hf_user' in st.session_state:
        del st.session_state.hf_user
# Colors
SNOWFLAKE_BLUE = "#29B5E8"
MID_BLUE = "#11567F"
VALENCIA_ORANGE = "#FF9F36"
STAR_BLUE = "#75CDD7"
FIRST_LIGHT = "#D45B90"
PURPLE_MOON = "#7254A3"
MEDIUM_GRAY = "#5B5B5B"
# Available tags for filtering - can be extended
AVAILABLE_TAGS = [
"Agentic",
"Conventional RAG",
"Sparse Search Tool",
"Semantic Search Tool",
"Vision and Language",
"Text-only",
]
# Tag colors for visual distinction (cycling through Snowflake secondary colors)
TAG_COLORS = {
"Agentic": SNOWFLAKE_BLUE,
"Conventional RAG": STAR_BLUE,
"Sparse Search Tool": VALENCIA_ORANGE,
"Semantic Search Tool": FIRST_LIGHT,
"Vision and Language": PURPLE_MOON,
"Text-only": SNOWFLAKE_BLUE,
}
# Custom CSS following Snowflake Brand Color Guide
# Primary: MID-BLUE (#11567F) for accents/sections, SNOWFLAKE BLUE (#29B5E8) sparingly
# Use white text on dark backgrounds per accessibility guidelines
st.markdown(f"""
<style>
/* Dark theme base - using near-black for good contrast */
.stApp {{
background-color: #0e1117;
}}
/* ===== TAB STYLING ===== */
.stTabs [data-baseweb="tab-list"] {{
gap: 8px;
background-color: transparent;
border-bottom: 2px solid {MID_BLUE};
padding-bottom: 0;
}}
.stTabs [data-baseweb="tab"] {{
height: 50px;
padding: 0 28px;
background-color: transparent !important;
border-radius: 0;
font-weight: 500;
font-size: 18px;
color: {MEDIUM_GRAY} !important;
border-bottom: 3px solid transparent !important;
margin-bottom: -2px;
}}
.stTabs [aria-selected="true"] {{
background-color: transparent !important;
color: {SNOWFLAKE_BLUE} !important;
border-bottom: 3px solid {SNOWFLAKE_BLUE} !important;
}}
.stTabs [data-baseweb="tab"]:hover {{
color: {SNOWFLAKE_BLUE} !important;
}}
/* Tab indicator overrides */
.stTabs [data-baseweb="tab-highlight"],
div[data-baseweb="tab-highlight"] {{
background-color: {SNOWFLAKE_BLUE} !important;
}}
.stTabs [role="tablist"] > div:last-child {{
background-color: {SNOWFLAKE_BLUE} !important;
}}
/* ===== CHECKBOX STYLING - Clean, no background highlight ===== */
.stCheckbox {{
background: transparent !important;
}}
.stCheckbox label {{
background: transparent !important;
color: white !important;
}}
.stCheckbox label span {{
background: transparent !important;
color: white !important;
}}
/* Remove any highlight/selection background from checkbox labels */
.stCheckbox > label,
.stCheckbox label > span,
.stCheckbox label > div {{
background-color: transparent !important;
background: none !important;
}}
/* The checkbox box itself - unchecked */
.stCheckbox [data-baseweb="checkbox"] > div:first-child {{
border-color: {MEDIUM_GRAY} !important;
background-color: transparent !important;
border-width: 2px !important;
}}
/* Checkbox when checked - fill with blue */
.stCheckbox [data-baseweb="checkbox"][aria-checked="true"] > div:first-child,
[data-testid="stCheckbox"] [aria-checked="true"] > div:first-child {{
background-color: {SNOWFLAKE_BLUE} !important;
border-color: {SNOWFLAKE_BLUE} !important;
}}
/* Alternative selector for checked state */
input[type="checkbox"]:checked + div {{
background-color: {SNOWFLAKE_BLUE} !important;
}}
/* Checkmark icon - make it visible */
.stCheckbox [data-baseweb="checkbox"] svg,
[data-baseweb="checkbox"] svg {{
color: white !important;
stroke: white !important;
fill: white !important;
}}
/* ===== BUTTON STYLING - MID-BLUE primary ===== */
.stButton > button {{
background-color: {MID_BLUE} !important;
color: white !important;
border: none !important;
border-radius: 6px;
font-weight: 500;
padding: 0.5rem 1.5rem;
transition: all 0.2s ease;
}}
.stButton > button:hover {{
background-color: {SNOWFLAKE_BLUE} !important;
}}
.stButton > button:active, .stButton > button:focus {{
background-color: {MID_BLUE} !important;
box-shadow: 0 0 0 2px {SNOWFLAKE_BLUE} !important;
}}
/* Download button */
.stDownloadButton > button {{
background-color: {MID_BLUE} !important;
color: white !important;
border: none !important;
}}
.stDownloadButton > button:hover {{
background-color: {SNOWFLAKE_BLUE} !important;
}}
/* ===== FORM ELEMENTS ===== */
/* Text inputs */
.stTextInput > div > div > input {{
border-color: {MEDIUM_GRAY} !important;
background-color: #1a1a2e !important;
}}
.stTextInput > div > div > input:focus {{
border-color: {SNOWFLAKE_BLUE} !important;
box-shadow: 0 0 0 1px {SNOWFLAKE_BLUE} !important;
}}
/* Select boxes */
.stSelectbox [data-baseweb="select"] > div {{
border-color: {MEDIUM_GRAY} !important;
background-color: #1a1a2e !important;
}}
/* Multiselect chips */
.stMultiSelect [data-baseweb="tag"] {{
background-color: {MID_BLUE} !important;
color: white !important;
}}
/* File uploader */
[data-testid="stFileUploader"] {{
border: 2px dashed {MEDIUM_GRAY} !important;
border-radius: 12px;
padding: 2rem 1.5rem !important;
background-color: transparent !important;
transition: all 0.2s ease;
}}
[data-testid="stFileUploader"]:hover {{
border-color: {SNOWFLAKE_BLUE} !important;
background-color: rgba(17, 86, 127, 0.08) !important;
}}
[data-testid="stFileUploaderDropzone"] {{
background-color: transparent !important;
}}
[data-testid="stFileUploader"] section {{
padding: 0 !important;
}}
[data-testid="stFileUploader"] section > div {{
padding: 0.5rem 0 !important;
}}
/* ===== LINKS - Snowflake Blue for visibility ===== */
/* Exclude link buttons from global link styling */
a:not([data-testid*="LinkButton"]):not([class*="LinkButton"]) {{
color: {SNOWFLAKE_BLUE} !important;
text-decoration: none !important;
}}
a:not([data-testid*="LinkButton"]):not([class*="LinkButton"]):hover {{
color: {STAR_BLUE} !important;
text-decoration: underline !important;
}}
/* HuggingFace login button - style for st.link_button */
[data-testid="stLinkButton"] a,
[data-testid="stLinkButton"] a *,
[data-testid="stLinkButton"] a p,
[data-testid="stLinkButton"] a span {{
background: linear-gradient(135deg, #FF9D00 0%, #FFD21E 100%) !important;
color: #000000 !important;
border: none !important;
font-weight: 700 !important;
text-decoration: none !important;
}}
[data-testid="stLinkButton"] a:hover,
[data-testid="stLinkButton"] a:hover *,
[data-testid="stLinkButton"] a:hover p,
[data-testid="stLinkButton"] a:hover span {{
background: linear-gradient(135deg, #FFD21E 0%, #FF9D00 100%) !important;
color: #000000 !important;
text-decoration: none !important;
}}
/* ===== SECTION HEADERS ===== */
h3 {{
color: white;
}}
/* ===== ALERTS/MESSAGES ===== */
/* Base alert styling */
[data-testid="stAlert"] > div {{
border-radius: 8px !important;
padding: 1rem !important;
}}
/* Info messages - Snowflake Blue */
[data-testid="stAlert"][data-baseweb="notification"] {{
background-color: rgba(41, 181, 232, 0.15) !important;
border-left: 4px solid {SNOWFLAKE_BLUE} !important;
border-radius: 8px !important;
}}
/* Target by icon type for more specific styling */
.stAlert div[role="alert"] {{
background-color: rgba(41, 181, 232, 0.15) !important;
border-left: 4px solid {SNOWFLAKE_BLUE} !important;
border-radius: 8px !important;
padding: 1rem !important;
}}
/* Success - has checkmark icon */
.stSuccess div[role="alert"],
[data-testid="stAlert"]:has([data-testid="stIconSuccess"]) div[role="alert"] {{
background-color: rgba(117, 205, 215, 0.15) !important;
border-left: 4px solid {STAR_BLUE} !important;
}}
/* Warning - has warning icon */
.stWarning div[role="alert"],
[data-testid="stAlert"]:has([data-testid="stIconWarning"]) div[role="alert"] {{
background-color: rgba(255, 159, 54, 0.15) !important;
border-left: 4px solid {VALENCIA_ORANGE} !important;
}}
/* Error - has error icon */
.stError div[role="alert"],
[data-testid="stAlert"]:has([data-testid="stIconError"]) div[role="alert"] {{
background-color: rgba(212, 91, 144, 0.15) !important;
border-left: 4px solid {FIRST_LIGHT} !important;
}}
/* Alert text colors */
[data-testid="stAlert"] p,
.stAlert p {{
color: rgba(255, 255, 255, 0.9) !important;
}}
/* ===== SPINNER ===== */
.stSpinner > div {{
border-top-color: {SNOWFLAKE_BLUE} !important;
}}
/* ===== EXPANDER ===== */
.streamlit-expanderHeader {{
border-left: 3px solid {MID_BLUE};
background-color: rgba(17, 86, 127, 0.1) !important;
}}
/* ===== CODE BLOCKS ===== */
code {{
background-color: rgba(17, 86, 127, 0.2);
padding: 0.2em 0.4em;
border-radius: 3px;
color: {STAR_BLUE};
}}
/* ===== SCROLLBAR ===== */
::-webkit-scrollbar {{
width: 8px;
height: 8px;
}}
::-webkit-scrollbar-track {{
background: #1a1a2e;
}}
::-webkit-scrollbar-thumb {{
background: {MID_BLUE};
border-radius: 4px;
}}
::-webkit-scrollbar-thumb:hover {{
background: {SNOWFLAKE_BLUE};
}}
/* ===== ROOT VARIABLES ===== */
:root {{
--primary-color: {SNOWFLAKE_BLUE} !important;
}}
/* ===== MULTISELECT STYLING ===== */
/* Tag filter multiselect - MID_BLUE (gradient start) */
div[data-testid="stHorizontalBlock"] > div:first-child .stMultiSelect [data-baseweb="tag"] {{
background-color: {MID_BLUE} !important;
color: white !important;
}}
/* Column selector multiselect - SNOWFLAKE_BLUE (gradient end) */
div[data-testid="stHorizontalBlock"] > div:last-child .stMultiSelect [data-baseweb="tag"] {{
background-color: {SNOWFLAKE_BLUE} !important;
color: white !important;
}}
/* Default multiselect styling */
.stMultiSelect [data-baseweb="tag"] {{
border-radius: 12px !important;
padding: 2px 10px !important;
margin: 2px !important;
font-weight: 500 !important;
}}
.stMultiSelect [data-baseweb="tag"] span {{
color: inherit !important;
}}
/* Remove button in tag */
.stMultiSelect [data-baseweb="tag"] svg {{
color: white !important;
opacity: 0.8;
}}
.stMultiSelect [data-baseweb="tag"] svg:hover {{
opacity: 1;
}}
/* Placeholder text */
.stMultiSelect input::placeholder {{
color: {MEDIUM_GRAY} !important;
}}
</style>
""", unsafe_allow_html=True)
# Data paths
EVAL_RESULTS_PATH = Path(CACHE_PATH) / "eval-results"
EVAL_REQUESTS_PATH = Path(CACHE_PATH) / "eval-queue"
@st.cache_data(ttl=300)  # Cache for 5 minutes
def download_data():
    """Download the queue and results datasets from HuggingFace Hub.

    Each repo is fetched independently; a failure only emits a UI warning so
    the app can still render with whatever data is available.
    """
    targets = (
        (QUEUE_REPO, EVAL_REQUESTS_PATH, "queue"),
        (RESULTS_REPO, EVAL_RESULTS_PATH, "results"),
    )
    for repo_id, local_dir, label in targets:
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=str(local_dir),
                repo_type="dataset",
                tqdm_class=None,
                etag_timeout=30,
                token=TOKEN,
            )
        except Exception as e:
            st.warning(f"Could not download {label} data: {e}")
class ModelType:
    """Model provenance categories and their display colors."""
    API = "api"
    OPEN_WEIGHT = "open-weight"

    @staticmethod
    def get_color(model_type: str) -> str:
        """Map a model type to its chart/label color (gray for unknown types)."""
        palette = {
            ModelType.API: VALENCIA_ORANGE,
            ModelType.OPEN_WEIGHT: STAR_BLUE,
        }
        return palette.get(model_type, MEDIUM_GRAY)
# Load SVG icons from local assets folder
ASSETS_PATH = Path(__file__).resolve().parent / "assets"
def load_svg_icon(icon_name: str, fill_color: str = None) -> str:
    """Load SVG icon and return as data URI with optional color replacement.

    When fill_color is given, black fill/stroke attributes are recolored so
    the icon stays visible on the dark background.  Returns "" when the icon
    is missing or unreadable.  Mirrors the Gradio app's load_svg_data_uri.
    """
    path = ASSETS_PATH / f"{icon_name}.svg"
    if not path.exists():
        return ""
    try:
        svg = path.read_text(encoding="utf-8")
        if fill_color:
            for attr in ("fill", "stroke"):
                svg = svg.replace(f'{attr}="black"', f'{attr}="{fill_color}"')
        encoded = base64.b64encode(svg.encode()).decode()
        return f"data:image/svg+xml;base64,{encoded}"
    except Exception:
        return ""
def load_png_icon(icon_name: str) -> str:
    """Load PNG icon and return as data URI ("" when missing or unreadable)."""
    path = ASSETS_PATH / f"{icon_name}.png"
    if not path.exists():
        return ""
    try:
        encoded = base64.b64encode(path.read_bytes()).decode()
        return f"data:image/png;base64,{encoded}"
    except Exception:
        return ""
# Preload icons with Snowflake colors (matching Gradio app)
ICON_CLOUD = load_svg_icon("snow_cloud2", VALENCIA_ORANGE) # Orange cloud for API (same as Gradio)
ICON_CODE = load_svg_icon("snow_code", STAR_BLUE) # Blue code for open-weight (same as Gradio)
ICON_HUMAN = load_png_icon("human_performance")
# Tab header icons - use white to match header text color
HEADER_ICON_COLOR = "#FFFFFF"
ICON_MEDAL = load_svg_icon("snow_medal", HEADER_ICON_COLOR) # Leaderboard header icon
ICON_EYE = load_svg_icon("snow_eye", HEADER_ICON_COLOR) # Analysis header icon
ICON_DOCS = load_svg_icon("snow_docs", HEADER_ICON_COLOR) # About header icon
ICON_WRITE = load_svg_icon("snow_write", HEADER_ICON_COLOR) # Submit header icon
def generate_placeholder_description(model_name: str, tags: list, model_type: str) -> str:
    """Generate a placeholder description based on model metadata.

    Args:
        model_name: Display name (currently unused; kept for interface stability).
        tags: List of approach tags (see AVAILABLE_TAGS).
        model_type: "api" or "open-weight".

    Returns:
        A short description such as "API-based agentic system with sparse search".
    """
    parts = []
    # Describe model type
    if model_type == "api":
        parts.append("API-based")
    elif model_type == "open-weight":
        parts.append("Open-weight")
    if tags:
        # Describe approach based on tags
        if "Agentic" in tags:
            parts.append("agentic system")
        elif "Conventional RAG" in tags:
            parts.append("RAG pipeline")
        else:
            parts.append("model")
        # Add tool/capability info.  Tag names must match AVAILABLE_TAGS
        # exactly — the original checked "Sparse Search", which never matched
        # the actual "Sparse Search Tool" tag.
        capabilities = []
        if "Sparse Search Tool" in tags:
            capabilities.append("sparse search")
        if "Semantic Search Tool" in tags:
            capabilities.append("semantic search")
        if "Vision and Language" in tags:
            capabilities.append("vision")
        if "Text-only" in tags:
            capabilities.append("text-only")
        if capabilities:
            parts.append(f"with {', '.join(capabilities)}")
    else:
        parts.append("model")
    return " ".join(parts) if parts else ""
def get_model_type_html(model_type: str) -> str:
    """Get HTML for model type with icon and colored text.

    Falls back to an emoji label when the SVG icon failed to load.
    """
    color = ModelType.get_color(model_type)
    is_api = model_type == ModelType.API
    icon_uri = ICON_CLOUD if is_api else ICON_CODE
    if not icon_uri:
        # Icon asset missing: plain emoji fallback.
        fallback_emoji = "☁️" if is_api else "</>"
        return f'<span style="color: {color}; font-weight: 500;">{fallback_emoji} {model_type}</span>'
    return f'''<div style="display: inline-flex; align-items: center; white-space: nowrap;">
        <img src="{icon_uri}" style="width: 20px; height: 20px; vertical-align: middle;" />
        <span style="color: {color}; font-weight: 500; margin-left: 6px;">{model_type}</span>
    </div>'''
def _extract_timestamp_from_filename(filename: str) -> str:
"""Extract timestamp from filename like 'Model_results_20260109_152104.json'."""
import re
match = re.search(r'_(\d{8}_\d{6})\.json$', filename)
return match.group(1) if match else "00000000_000000"
def _detect_effort_uniform(result_file: Path, data: dict) -> bool:
    """Check if all predictions in the companion JSONL have the same effort value.

    Used to flag agentic submissions whose per-question effort never varies
    (suspicious for the Kuiper calibration metric).  Returns True only when
    every prediction with positive effort reports one identical value; False
    on missing/unreadable files or when efforts differ.
    """
    # Prefer an explicit pointer to the predictions file; otherwise derive the
    # path from the results filename (…_results_*.json -> …_predictions_*.jsonl).
    pred_rel = data.get("source_predictions_file")
    if pred_rel:
        pred_path = Path(EVAL_RESULTS_PATH) / pred_rel
    else:
        pred_path = Path(str(result_file).replace("_results_", "_predictions_").replace(".json", ".jsonl"))
    if not pred_path.exists():
        return False
    try:
        effort_values = set()
        with open(pred_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                pred = json.loads(line)
                # Step count: prefer the recorded search history; fall back to
                # the 'iterations' field when the history is absent or empty.
                search_history = pred.get('search_history', [])
                steps = len(search_history) if isinstance(search_history, list) and search_history else 0
                if steps == 0:
                    steps = pred.get('iterations', 0)
                try:
                    steps = float(steps) if steps else 0
                except (TypeError, ValueError):
                    # Non-numeric 'iterations' value — treat as zero effort.
                    steps = 0
                # llm_calls/effort may live at the top level or nested under
                # 'trajectory'; the `or {}` guards against trajectory being None.
                effort_dict = {
                    'steps': steps,
                    'llm_calls': pred.get('llm_calls') or (pred.get('trajectory', {}) or {}).get('llm_calls'),
                    'effort': pred.get('effort') or (pred.get('trajectory', {}) or {}).get('effort'),
                }
                val = get_effort_value(effort_dict)
                if val > 0:
                    effort_values.add(val)
                if len(effort_values) > 1:
                    # Early exit: two distinct effort values already observed.
                    return False
        return len(effort_values) == 1
    except Exception:
        # Corrupt/unreadable predictions file: treat as non-uniform.
        return False
@st.cache_data(ttl=300)  # Cache for 5 minutes
def load_eval_results() -> pd.DataFrame:
    """Load evaluation results from JSON files, keeping only the most recent per model.

    Scans <EVAL_RESULTS_PATH>/<org>/*_results_*.json, builds one leaderboard
    row per model (deduplicated by the timestamp embedded in the filename),
    and returns a DataFrame sorted by "Accuracy (LLM judge)" descending.
    Column names starting with '_' are hidden helper fields for rendering.
    """
    seen_models = {}  # Track: model_name -> (timestamp, result_dict)
    results_path = Path(EVAL_RESULTS_PATH)
    if not results_path.exists():
        return pd.DataFrame()
    for org_dir in results_path.iterdir():
        if org_dir.is_dir() and not org_dir.name.startswith('.'):
            for result_file in org_dir.glob("*_results_*.json"):
                try:
                    with open(result_file) as f:
                        data = json.load(f)
                    # Extract data
                    model_name = data.get("model_name", "Unknown")
                    metadata = data.get("metadata", {})
                    result_scores = data.get("results", {})
                    # Get tags - default to ["Agentic"] if not specified
                    tags = data.get("tags", metadata.get("tags", ["Agentic"]))
                    if isinstance(tags, str):
                        tags = [tags]  # Convert single tag to list
                    # Get per-domain scores if available
                    by_domain = result_scores.get("by_domain", {})
                    # Use semantic accuracy if available, otherwise fall back to ANLS*
                    overall = result_scores.get("overall", {})
                    single_ev = result_scores.get("single_evidence", {})
                    multi_page = result_scores.get("multi_evidence_same_doc", {})
                    multi_doc = result_scores.get("multi_evidence_multi_doc", {})
                    # Primary metric: semantic (ANLS* + LLM) if available, otherwise ANLS*
                    semantic_acc = overall.get("semantic", overall.get("anls", 0.0))
                    semantic_ci = overall.get("semantic_ci")  # 95% CI tuple
                    semantic_se = None
                    # Calculate CI/SE on-the-fly using bias correction if not stored
                    if semantic_acc > 0:
                        try:
                            from metrics import confidence_interval, standard_error
                            n = result_scores.get("single_evidence", {}).get("n", 500)
                            p = semantic_acc / 100.0  # Convert to proportion
                            if not semantic_ci:
                                ci = confidence_interval(p, n)  # Uses calibrated q0, q1, m0, m1
                                semantic_ci = (ci[0] * 100, ci[1] * 100)
                            if semantic_se is None:
                                semantic_se = standard_error(p, n) * 100  # SE in percentage points
                        except Exception:
                            # Keep whatever was stored; leave None when nothing was.
                            semantic_ci = semantic_ci if semantic_ci else None
                            semantic_se = semantic_se if semantic_se is not None else None
                    anls_acc = overall.get("anls", 0.0)
                    # Detect effort uniformity for Agentic models with Kuiper
                    kuiper_val = overall.get("kuiper", 0.0)
                    is_agentic = "Agentic" in tags if isinstance(tags, list) else False
                    effort_uniform = False
                    if is_agentic and kuiper_val and EVAL_AVAILABLE:
                        effort_uniform = _detect_effort_uniform(result_file, data)
                    result_dict = {
                        "Model": model_name,
                        "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
                        "Model Type": metadata.get("model_type", "unknown"),
                        "Tags": tags,  # Store as list
                        # Primary: Accuracy with LLM judge (ANLS* + LLM with bias correction)
                        "Accuracy (LLM judge)": semantic_acc,
                        "_Accuracy_SE": semantic_se,  # Hidden: for ±SE display
                        "_Accuracy_CI": semantic_ci,  # Hidden: for tooltip display
                        "Acc. Single-Hop": single_ev.get("semantic", single_ev.get("anls", 0.0)),
                        "Acc. Cross-Page": multi_page.get("semantic", multi_page.get("anls", 0.0)),
                        "Acc. Cross-Doc": multi_doc.get("semantic", multi_doc.get("anls", 0.0)),
                        # Secondary: Pure string-based ANLS* (hidden by default)
                        "ANLS* (string)": anls_acc,
                        # Attribution metrics
                        "Attribution (Page F1)": overall.get("page_f1", 0.0),
                        "Attribution (Doc F1)": overall.get("doc_f1", 0.0),
                        # Calibration metric
                        "Effort (Kuiper)": kuiper_val,
                        "_effort_uniform": effort_uniform,
                        "Submission Date": data.get("submission_date", ""),
                        "Link": data.get("link", ""),
                        "Description": data.get("description", metadata.get("description", "")) or
                            generate_placeholder_description(model_name, tags, metadata.get("model_type", "")),
                        # Per-domain scores (stored as JSON string for DataFrame compatibility)
                        "_by_domain": json.dumps(by_domain) if by_domain else "{}",
                    }
                    # Extract timestamp from filename
                    file_timestamp = _extract_timestamp_from_filename(result_file.name)
                    # Keep only the most recent result per model
                    if model_name not in seen_models or file_timestamp > seen_models[model_name][0]:
                        seen_models[model_name] = (file_timestamp, result_dict)
                except Exception as e:
                    st.warning(f"Error loading {result_file}: {e}")
    if not seen_models:
        return pd.DataFrame()
    # Build results list from deduplicated models
    results = [result_dict for _, result_dict in seen_models.values()]
    df = pd.DataFrame(results)
    df = df.sort_values("Accuracy (LLM judge)", ascending=False).reset_index(drop=True)
    return df
def get_all_tags_from_df(df: pd.DataFrame) -> list:
    """Return the sorted union of all tags found in the 'Tags' column.

    Non-list entries are ignored; a missing column yields an empty list.
    """
    unique_tags = set()
    if "Tags" in df.columns:
        for entry in df["Tags"]:
            if isinstance(entry, list):
                unique_tags.update(entry)
    return sorted(unique_tags)
def filter_df_by_tags(df: pd.DataFrame, selected_tags: list) -> pd.DataFrame:
    """Keep only rows whose 'Tags' list shares at least one selected tag.

    An empty selection disables filtering.  Rows whose Tags value is not a
    list are dropped when a filter is active.
    """
    if not selected_tags:
        return df
    wanted = set(selected_tags)

    def _matches(row_tags):
        return isinstance(row_tags, list) and bool(wanted.intersection(row_tags))

    return df[df["Tags"].apply(_matches)]
def render_tags_html(tags: list) -> str:
    """Render tags as styled badges.

    Returns one HTML string of pill-shaped <span> badges, colored per
    TAG_COLORS (MID_BLUE for unknown tags).  Empty string for missing or
    non-list input.
    """
    if not tags or not isinstance(tags, list):
        return ""
    badges = []
    for tag in tags:
        color = TAG_COLORS.get(tag, MID_BLUE)
        # Use lighter background with colored border for better readability;
        # "{color}20" appends a hex alpha channel for a translucent fill.
        badge = f'''<span style="
            display: inline-block;
            padding: 2px 8px;
            margin: 2px 3px;
            border-radius: 12px;
            font-size: 11px;
            font-weight: 500;
            background-color: {color}20;
            color: {color};
            border: 1px solid {color};
            white-space: nowrap;
        ">{tag}</span>'''
        badges.append(badge)
    return "".join(badges)
def format_model_name(row) -> str:
    """Return the model name, hyperlinked when a non-blank Link is present."""
    name = row["Model"]
    url = row.get("Link", "")
    if not (url and url.strip()):
        return name
    return f'<a href="{url}" target="_blank">{name}</a>'
def format_model_type(model_type: str) -> str:
    """Wrap the model type in a span colored and iconized per ModelType."""
    return (
        f'<span style="color: {ModelType.get_color(model_type)};">'
        f'{ModelType.get_icon(model_type)} {model_type}</span>'
    )
# Metric tooltips for table headers.
# Rendered by render_leaderboard_table as the <th title="..."> attribute, so
# keys must match the leaderboard column names exactly and the text must not
# contain double quotes (it is interpolated into an HTML attribute).
METRIC_TOOLTIPS = {
    "Accuracy (LLM judge)": "Answer accuracy using ANLS* + LLM judge with bias correction. Captures semantic correctness beyond string matching. Higher is better.",
    "ANLS* (string)": "String-based accuracy using ANLS* (Average Normalized Levenshtein Similarity). Stricter than semantic. Higher is better.",
    "Acc. Single-Hop": "Accuracy on questions requiring evidence from a single page.",
    "Acc. Cross-Page": "Accuracy on multi-hop questions requiring evidence from multiple pages within the same document.",
    "Acc. Cross-Doc": "Accuracy on multi-hop questions requiring evidence from multiple documents.",
    "Attribution (Page F1)": "F1 score for page-level attribution. Measures overlap between cited pages and gold evidence. Higher is better.",
    "Attribution (Doc F1)": "F1 score for document-level attribution. Measures whether the correct documents were identified. Higher is better.",
    "Effort (Kuiper)": "Effort calibration metric (Kuiper statistic). Measures if effort correlates with problem difficulty. Lower is better.",
    "Model Type": "API = cloud-based model, open-weight = downloadable weights",
    "Tags": "Approach characteristics: Agentic, RAG, search tools, vision capabilities, etc.",
}
def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_column: bool = True, uncertainty_mode: str = "± SE"):
    """Render an HTML table matching the Gradio leaderboard style.

    Args:
        df: Leaderboard rows, one per model. Helper columns prefixed with
            "_" (e.g. _Accuracy_SE, _Accuracy_CI, _effort_uniform) feed the
            uncertainty and effort annotations.
        columns: Column names to display, in order.
        show_analyze_column: If True, append a per-row "Analyze" link column.
        uncertainty_mode: One of "± SE", "90% CI", "95% CI", or "None"
    """
    # Hoisted: previously re-imported inside the per-row loops on every
    # uncertainty computation.
    from math import sqrt

    if df.empty:
        st.warning("No data available")
        return
    # Build table HTML with tooltips
    header_cells = []
    for col in columns:
        # Add line break before brackets for cleaner display
        display_col = col.replace(" (", "<br>(") if " (" in col else col
        tooltip = METRIC_TOOLTIPS.get(col, "")
        if tooltip:
            header_cells.append(f'<th title="{tooltip}" style="cursor: help;">{display_col}</th>')
        else:
            header_cells.append(f'<th>{display_col}</th>')
    # Add "Analyze" column header
    if show_analyze_column:
        header_cells.append('<th style="width: 70px;">Analyze</th>')
    header_cells = "".join(header_cells)
    # Columns that should be merged for human performance rows
    HUMAN_MERGE_COLS = ["Model", "Organization", "Model Type"]
    rows_html = ""
    for _, row in df.iterrows():
        cells = []
        model_name = row.get("Model", "")
        organization = row.get("Organization", "")
        # Oracle-retriever humans have no attribution/effort to report.
        hide_attrib_kuiper = model_name == "Human with Oracle Retriever"
        # Check if this is a human performance row (should merge Model, Organization, Model Type)
        is_human_row = organization == "Humanity"
        # Calculate colspan for human rows (count how many merge columns are in selected columns)
        human_colspan = sum(1 for col in HUMAN_MERGE_COLS if col in columns) if is_human_row else 1
        for col in columns:
            value = row.get(col, "")
            # Skip Organization and Model Type for human rows (they're merged into Model)
            if is_human_row and col in ["Organization", "Model Type"]:
                continue
            if col == "Model":
                # Model name with optional link and description
                link = row.get("Link", "")
                description = row.get("Description", "")
                human_icon_html = ""
                if is_human_row and ICON_HUMAN:
                    human_icon_html = (
                        f'<img src="{ICON_HUMAN}" alt="Human baseline" '
                        'style="width: 20px; height: 20px; vertical-align: text-bottom; margin-right: 6px;" />'
                    )
                if link and str(link).strip():
                    name_html = f'{human_icon_html}<a href="{link}" target="_blank" style="color: #29B5E8; font-weight: 500;">{value}</a>'
                else:
                    name_html = f'{human_icon_html}<span style="font-weight: 500;">{value}</span>'
                if description and str(description).strip():
                    cell_html = f'{name_html}<br><span style="font-size: 12px; color: {MEDIUM_GRAY}; font-weight: normal;">{description}</span>'
                else:
                    cell_html = name_html
                # For human rows, use colspan to span Model, Organization, and Model Type columns
                if is_human_row and human_colspan > 1:
                    cells.append(f'<td colspan="{human_colspan}">{cell_html}</td>')
                else:
                    cells.append(f'<td>{cell_html}</td>')
            elif col == "Model Type":
                # Model type with icon
                cell_html = get_model_type_html(str(value))
                cells.append(f'<td style="text-align: center;">{cell_html}</td>')
            elif col == "Tags":
                # Render tags as badges
                cell_html = render_tags_html(value)
                cells.append(f'<td>{cell_html}</td>')
            elif col == "Accuracy (LLM judge)" or col == "ANLS* (string)" or col.startswith("Acc."):
                # Format accuracy scores (scale 0-100)
                try:
                    acc_val = f"{float(value):.1f}" if value else "0"
                    acc_float = float(value) if value else 0
                except (ValueError, TypeError):
                    acc_val = str(value)
                    acc_float = 0
                # Add uncertainty based on mode
                cell_html = acc_val
                if uncertainty_mode != "None" and col == "Accuracy (LLM judge)":
                    # Primary metric: SE/CI were precomputed during evaluation.
                    se = row.get("_Accuracy_SE")
                    ci = row.get("_Accuracy_CI")
                    if uncertainty_mode == "± SE" and se is not None and se > 0:
                        ci_tooltip = f"95% CI: [{ci[0]:.1f}, {ci[1]:.1f}]" if ci else ""
                        uncertainty_text = f'<span style="font-size: 0.85em; color: #888;" title="{ci_tooltip}"> ± {se:.1f}</span>'
                        cell_html = f'{acc_val}{uncertainty_text}'
                    elif uncertainty_mode == "95% CI" and ci:
                        uncertainty_text = f'<span style="font-size: 0.8em; color: #888;"> [{ci[0]:.1f}-{ci[1]:.1f}]</span>'
                        cell_html = f'{acc_val}{uncertainty_text}'
                    elif uncertainty_mode == "90% CI" and se is not None and se > 0:
                        # 90% CI: z=1.645 instead of 1.96, so CI is ~84% of 95% CI width
                        z_90 = 1.645
                        half_width = se * z_90
                        ci_90_low = max(0, acc_float - half_width)
                        ci_90_high = min(100, acc_float + half_width)
                        uncertainty_text = f'<span style="font-size: 0.8em; color: #888;"> [{ci_90_low:.1f}-{ci_90_high:.1f}]</span>'
                        cell_html = f'{acc_val}{uncertainty_text}'
                elif uncertainty_mode != "None" and col.startswith("Acc.") and acc_float > 0:
                    # Compute uncertainty for breakdown accuracy columns
                    n_approx = 150  # Rough estimate for breakdown categories
                    p = acc_float / 100.0
                    if 0 < p < 1:
                        se_raw = sqrt(p * (1 - p) / n_approx)
                        # Correct the binomial SE for LLM-judge sensitivity/specificity.
                        se_adj = se_raw / (LLM_JUDGE_SPECIFICITY + LLM_JUDGE_SENSITIVITY - 1) * 100
                        if uncertainty_mode == "± SE":
                            uncertainty_text = f'<span style="font-size: 0.85em; color: #888;"> ± {se_adj:.1f}</span>'
                            cell_html = f'{acc_val}{uncertainty_text}'
                        elif uncertainty_mode == "95% CI":
                            half_width = se_adj * 1.96
                            ci_low = max(0, acc_float - half_width)
                            ci_high = min(100, acc_float + half_width)
                            uncertainty_text = f'<span style="font-size: 0.8em; color: #888;"> [{ci_low:.1f}-{ci_high:.1f}]</span>'
                            cell_html = f'{acc_val}{uncertainty_text}'
                        elif uncertainty_mode == "90% CI":
                            half_width = se_adj * 1.645
                            ci_low = max(0, acc_float - half_width)
                            ci_high = min(100, acc_float + half_width)
                            uncertainty_text = f'<span style="font-size: 0.8em; color: #888;"> [{ci_low:.1f}-{ci_high:.1f}]</span>'
                            cell_html = f'{acc_val}{uncertainty_text}'
                cells.append(f'<td style="text-align: center;">{cell_html}</td>')
            elif col.startswith("Attribution"):
                # Format F1 scores (scale 0-100) - NOT bias-adjusted
                if hide_attrib_kuiper:
                    cells.append('<td style="text-align: center;">—</td>')
                    continue
                try:
                    attr_val = f"{float(value):.1f}" if value else "0"
                    attr_float = float(value) if value else 0
                except (ValueError, TypeError):
                    attr_val = str(value)
                    attr_float = 0
                cell_html = attr_val
                # Add uncertainty for attribution metrics (simple binomial, no bias adjustment)
                if uncertainty_mode != "None" and attr_float > 0:
                    n_approx = 500  # Test set size
                    p = attr_float / 100.0
                    if 0 < p < 1:
                        se = sqrt(p * (1 - p) / n_approx) * 100  # No bias adjustment
                        if uncertainty_mode == "± SE":
                            uncertainty_text = f'<span style="font-size: 0.85em; color: #888;"> ± {se:.1f}</span>'
                            cell_html = f'{attr_val}{uncertainty_text}'
                        elif uncertainty_mode == "95% CI":
                            half_width = se * 1.96
                            ci_low = max(0, attr_float - half_width)
                            ci_high = min(100, attr_float + half_width)
                            uncertainty_text = f'<span style="font-size: 0.8em; color: #888;"> [{ci_low:.1f}-{ci_high:.1f}]</span>'
                            cell_html = f'{attr_val}{uncertainty_text}'
                        elif uncertainty_mode == "90% CI":
                            half_width = se * 1.645
                            ci_low = max(0, attr_float - half_width)
                            ci_high = min(100, attr_float + half_width)
                            uncertainty_text = f'<span style="font-size: 0.8em; color: #888;"> [{ci_low:.1f}-{ci_high:.1f}]</span>'
                            cell_html = f'{attr_val}{uncertainty_text}'
                cells.append(f'<td style="text-align: center;">{cell_html}</td>')
            elif col == "Effort (Kuiper)":
                # Format Kuiper statistic (lower is better for calibration)
                # Hide for Conventional RAG models (not meaningful)
                if hide_attrib_kuiper:
                    cells.append('<td style="text-align: center;">—</td>')
                    continue
                tags = row.get("Tags", [])
                is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
                if is_conventional_rag:
                    cell_html = "—"
                else:
                    try:
                        cell_html = f"{float(value):.1f}" if value else "0"
                    except (ValueError, TypeError):
                        cell_html = str(value)
                    # Uniform-effort agents: show the value parenthesized and
                    # grayed out, since effort calibration is not meaningful.
                    if row.get("_effort_uniform", False) and cell_html != "0":
                        tooltip = "This agent uses the same effort for all samples, so effort-invariance metric is not meaningful."
                        cell_html = f'<span style="color: #888; cursor: help;" title="{tooltip}">({cell_html})</span>'
                cells.append(f'<td style="text-align: center;">{cell_html}</td>')
            elif col == "Organization":
                cell_html = str(value) if value else ""
                cells.append(f'<td style="text-align: center;">{cell_html}</td>')
            else:
                cell_html = str(value) if value else ""
                cells.append(f'<td>{cell_html}</td>')
        # Add "Analyze" link cell
        if show_analyze_column:
            # URL-encode the model name for query param
            encoded_name = quote(str(model_name))
            analyze_link = f'<a href="?analyze={encoded_name}" target="_self" title="View detailed analysis">View</a>'
            cells.append(f'<td style="text-align: center;">{analyze_link}</td>')
        rows_html += f'<tr>{"".join(cells)}</tr>'
    table_html = f'''
    <style>
    .leaderboard-wrapper {{
        border: 2px solid {MID_BLUE};
        border-radius: 8px;
        overflow: hidden;
        font-size: 0;
    }}
    .leaderboard-table {{
        width: 100%;
        border-collapse: collapse;
        border-spacing: 0;
        font-size: 14px;
        background-color: #0e1117;
        margin: 0;
        padding: 0;
        border: none;
    }}
    .leaderboard-table thead tr {{
        background: linear-gradient(135deg, {MID_BLUE} 0%, {SNOWFLAKE_BLUE} 100%);
    }}
    .leaderboard-table thead th {{
        background: transparent;
        color: white;
        text-align: center;
        padding: 1.2em 0.75em;
        font-weight: 500;
        border: none;
        text-transform: none;
    }}
    .leaderboard-table thead th:not(:last-child) {{
        border-right: 1px solid rgba(255,255,255,0.15);
    }}
    .leaderboard-table tbody td {{
        padding: 0.75em;
        border-bottom: 1px solid {MEDIUM_GRAY}40;
        vertical-align: middle;
        color: white;
    }}
    .leaderboard-table tbody tr:last-child td {{
        border-bottom: none;
    }}
    .leaderboard-table tbody tr:nth-child(even) {{
        background-color: rgba(17, 86, 127, 0.12);
    }}
    .leaderboard-table tbody tr:hover {{
        background-color: rgba(17, 86, 127, 0.25);
    }}
    .leaderboard-table td:first-child {{
        min-width: 280px;
        max-width: 350px;
        word-wrap: break-word;
    }}
    /* Links in table use Snowflake Blue */
    .leaderboard-table a {{
        color: {SNOWFLAKE_BLUE};
        text-decoration: none;
    }}
    .leaderboard-table a:hover {{
        color: {STAR_BLUE};
        text-decoration: underline;
    }}
    </style>
    <div class="leaderboard-wrapper">
    <table class="leaderboard-table">
    <thead>
    <tr>{header_cells}</tr>
    </thead>
    <tbody>
    {rows_html}
    </tbody>
    </table>
    </div>
    '''
    st.markdown(table_html, unsafe_allow_html=True)
def build_csv_download_df(df: pd.DataFrame, columns: list, uncertainty_mode: str) -> pd.DataFrame:
    """Build a CSV-friendly DataFrame with uncertainty text included.

    Mirrors the formatting rules of render_leaderboard_table so the exported
    CSV matches what the on-screen table shows.

    Args:
        df: Full leaderboard frame, including "_"-prefixed helper columns
            (e.g. _Accuracy_SE, _Accuracy_CI) used for uncertainty.
        columns: Columns to export, in order.
        uncertainty_mode: One of "± SE", "90% CI", "95% CI", or "None".

    Returns:
        A new DataFrame restricted to ``columns`` with formatted score strings;
        empty when ``df`` or ``columns`` is empty.
    """
    if df.empty or not columns:
        return pd.DataFrame()
    # Hoisted: previously re-imported inside the per-cell loops.
    from math import sqrt
    export_df = df[columns].copy()
    for idx in export_df.index:
        row = df.loc[idx]
        for col in columns:
            value = row.get(col, "")
            if col == "Accuracy (LLM judge)" or col == "ANLS* (string)" or col.startswith("Acc."):
                try:
                    acc_float = float(value) if value else 0.0
                    acc_val = f"{acc_float:.1f}"
                except (ValueError, TypeError):
                    # Non-numeric cell: export as-is.
                    export_df.at[idx, col] = value
                    continue
                text = acc_val
                if uncertainty_mode != "None":
                    if col == "Accuracy (LLM judge)":
                        # Primary metric: SE/CI were precomputed during evaluation.
                        se = row.get("_Accuracy_SE")
                        ci = row.get("_Accuracy_CI")
                        if uncertainty_mode == "± SE" and se is not None and se > 0:
                            text = f"{acc_val} ± {se:.1f}"
                        elif uncertainty_mode == "95% CI":
                            if ci:
                                text = f"{acc_val} [{ci[0]:.1f}-{ci[1]:.1f}]"
                            elif se is not None and se > 0:
                                half_width = se * 1.96
                                text = f"{acc_val} [{max(0, acc_float - half_width):.1f}-{min(100, acc_float + half_width):.1f}]"
                        elif uncertainty_mode == "90% CI" and se is not None and se > 0:
                            half_width = se * 1.645
                            text = f"{acc_val} [{max(0, acc_float - half_width):.1f}-{min(100, acc_float + half_width):.1f}]"
                    elif col.startswith("Acc.") and acc_float > 0:
                        # Breakdown columns: binomial SE corrected for
                        # LLM-judge sensitivity/specificity.
                        n_approx = 150
                        p = acc_float / 100.0
                        if 0 < p < 1:
                            se_raw = sqrt(p * (1 - p) / n_approx)
                            se_adj = se_raw / (LLM_JUDGE_SPECIFICITY + LLM_JUDGE_SENSITIVITY - 1) * 100
                            if uncertainty_mode == "± SE":
                                text = f"{acc_val} ± {se_adj:.1f}"
                            elif uncertainty_mode == "95% CI":
                                half_width = se_adj * 1.96
                                text = f"{acc_val} [{max(0, acc_float - half_width):.1f}-{min(100, acc_float + half_width):.1f}]"
                            elif uncertainty_mode == "90% CI":
                                half_width = se_adj * 1.645
                                text = f"{acc_val} [{max(0, acc_float - half_width):.1f}-{min(100, acc_float + half_width):.1f}]"
                export_df.at[idx, col] = text
            elif col.startswith("Attribution"):
                try:
                    attr_float = float(value) if value else 0.0
                    attr_val = f"{attr_float:.1f}"
                except (ValueError, TypeError):
                    export_df.at[idx, col] = value
                    continue
                text = attr_val
                if uncertainty_mode != "None" and attr_float > 0:
                    # Attribution F1: plain binomial SE, no bias adjustment.
                    n_approx = 500
                    p = attr_float / 100.0
                    if 0 < p < 1:
                        se = sqrt(p * (1 - p) / n_approx) * 100
                        if uncertainty_mode == "± SE":
                            text = f"{attr_val} ± {se:.1f}"
                        elif uncertainty_mode == "95% CI":
                            half_width = se * 1.96
                            text = f"{attr_val} [{max(0, attr_float - half_width):.1f}-{min(100, attr_float + half_width):.1f}]"
                        elif uncertainty_mode == "90% CI":
                            half_width = se * 1.645
                            text = f"{attr_val} [{max(0, attr_float - half_width):.1f}-{min(100, attr_float + half_width):.1f}]"
                export_df.at[idx, col] = text
    return export_df
def create_accuracy_vs_attribution_plot(df: pd.DataFrame) -> go.Figure:
    """Create scatter plot of Accuracy vs Attribution."""
    if df.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(
            text="No data available",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=20, color="white"),
        )
        return empty_fig
    # API models in Valencia orange, open-weight in star blue; gray otherwise.
    type_colors = {
        "api": VALENCIA_ORANGE,
        "open-weight": STAR_BLUE,
    }
    fig = go.Figure()
    # One trace per model type so each gets its own legend entry.
    for model_type in df["Model Type"].unique():
        subset = df[df["Model Type"] == model_type]
        fig.add_trace(go.Scatter(
            x=subset["Attribution (Page F1)"],
            y=subset["Accuracy (LLM judge)"],
            mode="markers",
            name=model_type,
            text=subset["Model"],
            marker=dict(
                size=12,
                color=type_colors.get(model_type, MEDIUM_GRAY),
                line=dict(width=1.5, color="white"),
            ),
            hovertemplate="<b>%{text}</b><br>Attribution: %{x:.1f}<br>Accuracy: %{y:.1f}<extra></extra>",
        ))
    fig.update_layout(
        title=dict(text="Accuracy vs Attribution", font=dict(color="white")),
        xaxis_title="Attribution (Page F1)",
        yaxis_title="Accuracy (LLM judge)",
        hovermode="closest",
        template="plotly_dark",
        height=650,
        showlegend=True,
        legend=dict(title="Model Type", yanchor="top", y=0.99, xanchor="left", x=0.01, font=dict(color="#ccc")),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(14,17,23,0.8)",
        xaxis=dict(gridcolor=MID_BLUE, zerolinecolor=MID_BLUE),
        yaxis=dict(gridcolor=MID_BLUE, zerolinecolor=MID_BLUE),
    )
    return fig
def create_accuracy_vs_effort_plot(df: pd.DataFrame) -> go.Figure:
    """Create scatter plot of Accuracy vs Effort (Kuiper)."""

    def _keep_row(tags):
        # Kuiper is not meaningful for Conventional RAG systems, so drop them.
        if isinstance(tags, list):
            return "Conventional RAG" not in tags
        return True

    plot_df = df[df["Tags"].apply(_keep_row)]
    if plot_df.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(
            text="No data available",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=20, color="white"),
        )
        return empty_fig
    # API models in Valencia orange, open-weight in star blue; gray otherwise.
    type_colors = {
        "api": VALENCIA_ORANGE,
        "open-weight": STAR_BLUE,
    }
    fig = go.Figure()
    # One trace per model type so each gets its own legend entry.
    for model_type in plot_df["Model Type"].unique():
        subset = plot_df[plot_df["Model Type"] == model_type]
        fig.add_trace(go.Scatter(
            x=subset["Effort (Kuiper)"],
            y=subset["Accuracy (LLM judge)"],
            mode="markers",
            name=model_type,
            text=subset["Model"],
            marker=dict(
                size=12,
                color=type_colors.get(model_type, MEDIUM_GRAY),
                line=dict(width=1.5, color="white"),
            ),
            hovertemplate="<b>%{text}</b><br>Effort: %{x:.1f}<br>Accuracy: %{y:.1f}<extra></extra>",
        ))
    fig.update_layout(
        title=dict(text="Accuracy vs Effort", font=dict(color="white")),
        xaxis_title="Effort (Kuiper) — lower is better",
        yaxis_title="Accuracy (LLM judge)",
        hovermode="closest",
        template="plotly_dark",
        height=650,
        showlegend=True,
        legend=dict(title="Model Type", yanchor="top", y=0.99, xanchor="left", x=0.01, font=dict(color="#ccc")),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(14,17,23,0.8)",
        xaxis=dict(gridcolor=MID_BLUE, zerolinecolor=MID_BLUE),
        yaxis=dict(gridcolor=MID_BLUE, zerolinecolor=MID_BLUE),
    )
    return fig
def create_domain_accuracy_chart(by_domain: dict, model_name: str, overall_accuracy: float = 0) -> go.Figure:
    """Create a horizontal bar chart showing accuracy by domain."""
    # Drop the catch-all "Other" bucket before plotting.
    domain_stats = {name: stats for name, stats in by_domain.items() if name.lower() != 'other'}
    if not domain_stats:
        fig = go.Figure()
        fig.add_annotation(
            text="No per-domain data available",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=16, color="white"),
        )
        fig.update_layout(
            template="plotly_dark",
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(14,17,23,0.8)",
        )
        return fig
    # Rank domains by accuracy, best first.
    ranked = sorted(domain_stats.items(), key=lambda item: item[1].get('anls', 0), reverse=True)
    domains = [name for name, _ in ranked]
    accuracies = [stats.get('anls', 0) for _, stats in ranked]
    counts = [stats.get('n', 0) for _, stats in ranked]
    # Blue when at/above the model's overall accuracy, orange when below.
    bar_colors = [SNOWFLAKE_BLUE if acc >= overall_accuracy else VALENCIA_ORANGE for acc in accuracies]
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=domains,
        x=accuracies,
        orientation='h',
        marker=dict(
            color=bar_colors,
            line=dict(width=1, color='white'),
        ),
        text=[f"{acc:.1f}% (n={n})" for acc, n in zip(accuracies, counts)],
        textposition='auto',
        textfont=dict(color='white', size=11),
        hovertemplate="<b>%{y}</b><br>Accuracy: %{x:.1f}%<extra></extra>",
    ))
    fig.update_layout(
        title=dict(
            text=f"Accuracy by Domain: {model_name}",
            font=dict(color="white", size=16),
        ),
        xaxis_title="Accuracy (ANLS* %)",
        yaxis_title="",
        template="plotly_dark",
        height=max(400, len(domains) * 35),  # Dynamic height based on number of domains
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(14,17,23,0.8)",
        xaxis=dict(
            gridcolor=MID_BLUE,
            zerolinecolor=MID_BLUE,
            range=[0, 100],
        ),
        yaxis=dict(
            gridcolor=MID_BLUE,
            autorange="reversed",  # Keep highest at top
        ),
        margin=dict(l=150, r=50, t=60, b=50),
    )
    return fig
def show_model_details(model_name: str):
    """Show detailed per-domain breakdown for a model."""
    # Pull the model's row from the cached leaderboard frame.
    leaderboard = load_eval_results()
    if leaderboard.empty:
        st.warning("No model data available")
        return
    matches = leaderboard[leaderboard["Model"] == model_name]
    if matches.empty:
        st.warning(f"Model '{model_name}' not found")
        return
    record = matches.iloc[0]
    # Conventional RAG systems get no effort metric (not meaningful).
    tags = record.get('Tags', [])
    is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
    # Headline metrics.
    c1, c2, c3 = st.columns(3)
    with c1:
        st.metric("Accuracy (LLM judge)", f"{record['Accuracy (LLM judge)']:.1f}%")
    with c2:
        st.metric("Attribution (Page F1)", f"{record['Attribution (Page F1)']:.1f}%")
    with c3:
        if is_conventional_rag:
            st.metric("Effort (Kuiper)", "—")
        elif record.get('_effort_uniform', False):
            # Parenthesize the value for uniform-effort agents.
            kuiper = record.get('Effort (Kuiper)', 0)
            st.metric("Effort (Kuiper)", f"({kuiper:.2f})" if kuiper else "N/A", help="This agent uses the same effort for all samples, so effort-invariance metric is not meaningful.")
        else:
            kuiper = record.get('Effort (Kuiper)', 0)
            st.metric("Effort (Kuiper)", f"{kuiper:.2f}" if kuiper else "N/A")
    if is_conventional_rag:
        st.caption("*Effort (Kuiper) is only meaningful for Agentic systems with iterative search behavior.*")
    # Breakdown by hop type, one metric per column.
    hop_columns = st.columns(3)
    for column, label in zip(hop_columns, ("Acc. Single-Hop", "Acc. Cross-Page", "Acc. Cross-Doc")):
        with column:
            score = record.get(label, 0)
            st.metric(label, f"{score:.1f}%" if score else "N/A")
    # Per-domain scores are stored as a JSON string in newer submissions.
    raw_domains = record.get('_by_domain', '{}')
    try:
        by_domain = json.loads(raw_domains) if isinstance(raw_domains, str) else raw_domains
    except (json.JSONDecodeError, TypeError):
        by_domain = {}
    if by_domain:
        overall = record.get('Accuracy (LLM judge)', 0)
        chart = create_domain_accuracy_chart(by_domain, model_name, overall)
        st.plotly_chart(chart, width="stretch")
    else:
        st.info("Per-domain breakdown not available for this submission. Newer submissions will include this data.")
def _prediction_has_effort(pred: dict) -> bool:
"""Check if a prediction contains at least one valid effort measure."""
search_history = pred.get('search_history', [])
if isinstance(search_history, list) and len(search_history) > 0:
return True
for key in ('iterations', 'steps', 'llm_calls', 'effort'):
val = pred.get(key)
if val is not None:
try:
if float(val) > 0:
return True
except (TypeError, ValueError):
pass
trajectory = pred.get('trajectory', {})
if isinstance(trajectory, dict):
for key in ('llm_calls', 'effort'):
val = trajectory.get(key)
if val is not None:
try:
if float(val) > 0:
return True
except (TypeError, ValueError):
pass
return False
def validate_jsonl_submission(file_content: str) -> tuple[bool, str, list]:
    """Validate JSONL submission format and return parsed predictions.

    Args:
        file_content: Raw text of the uploaded file, one JSON object per line.

    Returns:
        (ok, error_message, predictions): ``ok`` is True when every non-empty
        line is a JSON object containing 'question' and 'answer'. On failure,
        ``error_message`` names the offending line and ``predictions`` is [].
    """
    try:
        lines = file_content.strip().split("\n")
        if not lines or (len(lines) == 1 and not lines[0].strip()):
            return False, "File is empty", []
        predictions = []
        for line_num, line in enumerate(lines, 1):
            line = line.strip()
            if not line:
                continue
            try:
                pred = json.loads(line)
            except json.JSONDecodeError as e:
                return False, f"Line {line_num}: Invalid JSON - {str(e)}", []
            # Each line must be a JSON object. Previously a scalar/array line
            # raised TypeError on the membership test below and surfaced as a
            # confusing "Error reading file: ..." message.
            if not isinstance(pred, dict):
                return False, f"Line {line_num}: Expected a JSON object, got {type(pred).__name__}", []
            # Required: question and answer
            if "question" not in pred:
                return False, f"Line {line_num}: Missing required field 'question'", []
            if "answer" not in pred:
                return False, f"Line {line_num}: Missing required field 'answer'", []
            predictions.append(pred)
        return True, "", predictions
    except Exception as e:
        # Catch-all so a malformed upload never crashes the UI.
        return False, f"Error reading file: {str(e)}", []
@st.cache_data(ttl=3600)  # Cache for 1 hour
def derive_hop_type(evidence: list) -> str:
    """Derive hop type from evidence list.

    - single: Single page from a single document
    - cross_page: Multiple pages from the same document
    - cross_doc: Pages from different documents

    Args:
        evidence: List of dicts with 'document' and 'page' keys

    Returns:
        'single', 'cross_page', or 'cross_doc'
    """
    if not evidence:
        return 'single'
    # Distinct documents, and distinct (document, page) pairs.
    doc_ids = {ev.get('document') for ev in evidence if ev.get('document') is not None}
    page_ids = {
        (ev.get('document'), ev.get('page'))
        for ev in evidence
        if ev.get('document') is not None and ev.get('page') is not None
    }
    if len(doc_ids) > 1:
        return 'cross_doc'  # Multiple documents
    if len(page_ids) > 1:
        return 'cross_page'  # Multiple pages from same document
    return 'single'  # Single page
def load_gold_standard(dataset_name: str = "agentic-document-ai/dataset-PRIVATE", split: str = "test"):
    """Load gold standard from HuggingFace dataset.

    Note: Uses dataset-PRIVATE for test split (contains gold answers).

    Returns:
        (by_text, by_id): the same gold records keyed by question text and,
        when an id exists, by question id. Both empty on failure.
    """
    if not EVAL_AVAILABLE:
        return {}, {}
    try:
        dataset = load_dataset(dataset_name, split=split)
        by_text, by_id = {}, {}
        for ex in dataset:
            question = ex['question'].strip()
            qid = ex.get('id', '')
            # Different splits name the answer field differently; normalize
            # everything to a list of answer-variant lists.
            answers = ex.get('answer_variants') or ex.get('answers') or []
            if isinstance(answers, str):
                answers = [[answers]]
            elif answers and isinstance(answers[0], str):
                answers = [answers]
            evidence = ex.get('evidence', [])
            record = {
                'answers': answers,
                'evidence': evidence,
                'category': ex.get('document_category', ''),
                'domain': ex.get('domain', ''),
                # Derive hop type from evidence structure
                'hop_type': derive_hop_type(evidence),
            }
            by_text[question] = record
            if qid:
                by_id[qid] = record
        return by_text, by_id
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return {}, {}
def _evaluate_single_item(args, max_retries=3):
    """Evaluate a single prediction item (for parallel processing)."""
    import time as _time
    idx, pred, gold_data, use_llm_judge = args
    question = pred.get('question', '').strip()
    answer = pred.get('answer', '')
    citations = pred.get('citations', [])
    history = pred.get('search_history', [])
    # Effort proxy: length of the search history, else 'iterations'.
    steps = len(history) if history else pred.get('iterations', 0)
    # Effort metrics may sit at the top level or nested in 'trajectory';
    # falsy top-level values (None/0) fall through to the nested ones.
    trajectory = pred.get('trajectory', {})
    if not isinstance(trajectory, dict):
        trajectory = {}
    llm_calls = pred.get('llm_calls') or trajectory.get('llm_calls')
    effort = pred.get('effort') or trajectory.get('effort')
    # Deterministic (non-LLM) metrics first.
    anls = anls_star(answer, gold_data['answers'])
    doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
    page_f1 = citation_f1(citations, gold_data['evidence'], level='page')
    # Semantic accuracy via the LLM judge (falls back to ANLS* when disabled),
    # retrying transient judge failures with exponential backoff.
    if use_llm_judge:
        for attempt in range(max_retries):
            try:
                semantic_score = anls_star_llm(answer, gold_data['answers'], question)['score']
                break
            except Exception:
                if attempt == max_retries - 1:
                    raise
                _time.sleep(2 ** attempt)
    else:
        semantic_score = anls
    return {
        'idx': idx,
        'question': question,
        'anls': anls,
        'semantic_score': semantic_score,
        'correct': semantic_score >= 0.5,
        'doc_f1': doc_f1['f1'],
        'page_f1': page_f1['f1'],
        'steps': steps,
        'llm_calls': llm_calls,
        'effort': effort,
        'hop_type': gold_data.get('hop_type', 'single'),
        'category': gold_data['category'],
        'domain': gold_data['domain'],
    }
def evaluate_predictions(
    predictions: list,
    gold_by_text: dict,
    gold_by_id: dict,
    use_llm_judge: bool = True,
    progress_callback=None
) -> dict:
    """Evaluate predictions against gold standard (parallelized when using LLM judge).

    Matches each prediction to a gold record (by question text, then by id),
    scores all matches in a thread pool via _evaluate_single_item, then
    aggregates overall / per-hop-type / per-domain metrics.

    Args:
        predictions: List of prediction dicts
        gold_by_text: Gold data indexed by question text
        gold_by_id: Gold data indexed by question ID
        use_llm_judge: If True, use ANLS*+LLM for semantic accuracy (default)
        progress_callback: Optional callback(current, total) for progress updates

    Returns:
        Dict of aggregated metrics, or {"error": ...} when evaluation is
        unavailable or nothing matched.
    """
    if not EVAL_AVAILABLE:
        return {"error": "Evaluation module not available"}
    # First pass: match predictions to gold standard
    matched_items = []
    unmatched = []
    for pred in predictions:
        question = pred.get('question', '').strip()
        qid = pred.get('id', '')
        # Match to gold: exact question text first, id as fallback.
        gold_data = None
        if question in gold_by_text:
            gold_data = gold_by_text[question]
        elif qid and qid in gold_by_id:
            gold_data = gold_by_id[qid]
        if gold_data:
            matched_items.append((pred, gold_data, use_llm_judge))
        else:
            # Keep a truncated sample of unmatched questions for the report.
            unmatched.append(question[:50] + "..." if len(question) > 50 else question)
    if not matched_items:
        return {"error": "No predictions matched the gold standard"}
    # Prepare items with index
    items_with_idx = [(i, pred, gold, llm) for i, (pred, gold, llm) in enumerate(matched_items)]
    total = len(items_with_idx)
    evals = []
    completed = 0
    # Parallel evaluation with ThreadPoolExecutor (much faster for LLM calls)
    # NOTE(review): evals is filled in completion order (as_completed), not
    # submission order; each result carries its 'idx' if ordering matters.
    with ThreadPoolExecutor(max_workers=MAX_EVAL_WORKERS) as executor:
        futures = {executor.submit(_evaluate_single_item, item): item[0]
                   for item in items_with_idx}
        for future in as_completed(futures):
            result = future.result()  # Will raise if failed after retries
            evals.append(result)
            completed += 1
            if progress_callback:
                progress_callback(completed, total)
    # Aggregate overall metrics
    n = len(evals)
    semantic_scores = [e['semantic_score'] for e in evals]
    # Apply bias correction for semantic accuracy
    if use_llm_judge:
        agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)
        mean_semantic = agg['adjusted_score'] * 100
        semantic_ci = (agg['ci_lower'] * 100, agg['ci_upper'] * 100)
    else:
        mean_semantic = sum(semantic_scores) / n * 100
        semantic_ci = None
    mean_anls = sum(e['anls'] for e in evals) / n * 100
    accuracy = sum(e['correct'] for e in evals) / n * 100
    mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
    mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100
    # Kuiper statistic (effort-calibration metric over the eval records)
    kuiper = kuiper_statistic(evals)
    # By hop type
    single_hop = [e for e in evals if e['hop_type'] == 'single']
    cross_page = [e for e in evals if e['hop_type'] == 'cross_page']
    cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']
    # By domain ('Other' bucket for records without a domain)
    by_domain = defaultdict(list)
    for e in evals:
        domain = e['domain'] or 'Other'
        by_domain[domain].append(e)
    domain_scores = {}
    for domain, domain_evals in sorted(by_domain.items()):
        domain_semantic_scores = [e['semantic_score'] for e in domain_evals]
        if use_llm_judge:
            # Bias correction is applied per domain, same as overall.
            domain_agg = aggregate_anls_star_llm(domain_semantic_scores, apply_bias_correction=True)
            domain_semantic = domain_agg['adjusted_score'] * 100
        else:
            domain_semantic = sum(domain_semantic_scores) / len(domain_semantic_scores) * 100
        domain_scores[domain] = {
            'semantic': domain_semantic,
            'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
            'n': len(domain_evals)
        }
    results = {
        'n_evaluated': n,
        'n_unmatched': len(unmatched),
        'unmatched_samples': unmatched[:5],
        'overall': {
            'semantic': mean_semantic,  # Primary metric (ANLS* + LLM judge)
            'semantic_ci': semantic_ci,  # 95% CI if LLM judge used
            'anls': mean_anls,  # Secondary metric (pure ANLS*)
            'accuracy': accuracy,
            'doc_f1': mean_doc_f1,
            'page_f1': mean_page_f1,
            # None when the Kuiper computation reports a degenerate case.
            'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
        },
        'single_evidence': {
            'semantic': (
                aggregate_anls_star_llm([e['semantic_score'] for e in single_hop], apply_bias_correction=True)['adjusted_score'] * 100
                if (use_llm_judge and single_hop) else (sum(e['semantic_score'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0)
            ),
            'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
            'n': len(single_hop)
        },
        'multi_evidence_same_doc': {
            'semantic': (
                aggregate_anls_star_llm([e['semantic_score'] for e in cross_page], apply_bias_correction=True)['adjusted_score'] * 100
                if (use_llm_judge and cross_page) else (sum(e['semantic_score'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0)
            ),
            'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
            'n': len(cross_page)
        },
        'multi_evidence_multi_doc': {
            'semantic': (
                aggregate_anls_star_llm([e['semantic_score'] for e in cross_doc], apply_bias_correction=True)['adjusted_score'] * 100
                if (use_llm_judge and cross_doc) else (sum(e['semantic_score'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0)
            ),
            'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
            'n': len(cross_doc)
        },
        'by_domain': domain_scores,
        'used_llm_judge': use_llm_judge
    }
    return results
@st.fragment
def submit_results_fragment():
    """Fragment for file upload, evaluation, and leaderboard submission.

    Runs as a Streamlit fragment so widget interactions re-render only this
    section instead of the whole app. Flow:

    1. Require a HuggingFace login (``get_hf_user``) and enforce the per-user
       submission rate limit (``can_user_submit``).
    2. Step 1: accept predictions via file upload or pasted JSONL, validate
       them, and evaluate against the gold standard with the LLM judge.
    3. Step 2: collect model metadata (name, organization, type, tags).
    4. Step 3: upload the results JSON and raw predictions to RESULTS_REPO.
    """
    # Check HuggingFace login
    hf_user = get_hf_user()
    if not hf_user:
        st.warning("**Login Required**: Please sign in with your HuggingFace account to submit results.")
        # Show login button
        if not show_login_button():
            st.info("""
            **Login not available.** This feature requires deployment on HuggingFace Spaces
            with `hf_oauth: true` in the Space's README.md metadata.
            For local testing, set: `TEST_HF_USER=your_username`
            """)
        return
    # Show logged-in user
    st.success(f"Logged in as **{hf_user['username']}**")
    # Check submission rate limit (prevents overfitting to the test set)
    can_submit, limit_msg, hours_left = can_user_submit(hf_user['username'])
    if not can_submit:
        st.warning(f"**Rate Limit**: {limit_msg}")
        st.info("""
        This limit helps prevent overfitting to the test set.
        You can still evaluate locally on the **dev set**:
        ```bash
        python evaluate.py your_predictions.jsonl --dataset agentic-document-ai/dataset --split dev
        ```
        """)
        return
    # Step 1: Upload and Evaluate
    st.markdown("#### Step 1: Upload Predictions")
    # Two options: file upload or paste text
    upload_tab, paste_tab = st.tabs(["Upload File", "Paste JSONL"])
    with upload_tab:
        uploaded_file = st.file_uploader(
            "Upload your predictions JSONL file",
            type=["jsonl"],
            help="One prediction per line with 'question' and 'answer' fields",
        )
    with paste_tab:
        pasted_content = st.text_area(
            "Paste your JSONL content",
            height=200,
            help="One JSON object per line",
            placeholder='{"question": "...", "answer": "...", "citations": [...]}\n{"question": "...", "answer": "...", "citations": [...]}',
        )
    with st.expander("Expected JSONL format"):
        st.code('''{"question": "What is the total revenue?", "answer": "$1.2M", "citations": [{"file": "report.pdf", "page": 5}], "iterations": 3}
{"question": "Who signed the contract?", "answer": ["John Smith", "Jane Doe"], "citations": [{"file": "contract.pdf", "page": 12}], "iterations": 2}''', language="json")
        st.markdown("""
        **Required fields:**
        - `question`: The question text (must match dataset)
        - `answer`: Predicted answer (string or list)
        **Optional fields (for full metrics):**
        - `citations`: List of `{"file": "...", "page": N}` for attribution metrics
        - `id`: Question ID (fallback matching)
        **Effort fields (required for Agentic submissions, at least one per sample):**
        - `steps`: Number of agentic steps taken (positive integer)
        - `search_history`: List of search queries performed (e.g. `["query1", "query2"]`)
        - `effort`: Generic effort measure (positive number), should be proportional to the number of searches, LLM calls, or reasoning tokens generated, in this order of preference
        """)
    # Initialize session state for evaluation results
    if 'eval_results' not in st.session_state:
        st.session_state.eval_results = None
    if 'predictions' not in st.session_state:
        st.session_state.predictions = None
    # Get content from either file upload or paste (upload takes precedence)
    file_content = None
    if uploaded_file is not None:
        file_content = uploaded_file.read().decode("utf-8")
    elif pasted_content and pasted_content.strip():
        file_content = pasted_content.strip()
    if file_content:
        is_valid, error_msg, predictions = validate_jsonl_submission(file_content)
        if not is_valid:
            st.error(f"Invalid input: {error_msg}")
        else:
            st.success(f"Loaded {len(predictions)} predictions")
            st.session_state.predictions = predictions
            st.session_state.predictions_raw = file_content  # Store raw content for upload
            # Evaluate button
            if st.button("Run Evaluation", type="primary"):
                with st.spinner("Loading gold standard..."):
                    gold_by_text, gold_by_id = load_gold_standard()
                if not gold_by_text:
                    st.error("Failed to load gold standard dataset")
                else:
                    # Progress bar for evaluation
                    progress_bar = st.progress(0, text="Evaluating predictions with semantic accuracy...")
                    status_text = st.empty()

                    def update_progress(current, total):
                        # Callback forwarded into evaluate_predictions
                        progress_bar.progress(current / total, text=f"Evaluating {current}/{total}...")

                    results = evaluate_predictions(
                        predictions,
                        gold_by_text,
                        gold_by_id,
                        use_llm_judge=True,
                        progress_callback=update_progress
                    )
                    progress_bar.empty()
                    status_text.empty()
                    st.session_state.eval_results = results
    # Show evaluation results (persisted in session state across reruns)
    if st.session_state.eval_results:
        results = st.session_state.eval_results
        if 'error' in results:
            st.error(results['error'])
        else:
            st.markdown("#### Evaluation Results")
            # Summary metrics - use semantic accuracy as primary if available
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                if 'semantic' in results['overall']:
                    ci = results['overall'].get('semantic_ci')
                    ci_text = f" [{ci[0]:.1f}-{ci[1]:.1f}]" if ci else ""
                    st.metric("Accuracy (LLM judge)", f"{results['overall']['semantic']:.1f}{ci_text}")
                else:
                    st.metric("Accuracy (ANLS*)", f"{results['overall']['anls']:.1f}")
            with col2:
                st.metric("Attribution (Page F1)", f"{results['overall']['page_f1']:.1f}")
            with col3:
                kuiper_val = results['overall']['kuiper']
                st.metric("Effort (Kuiper)", f"{kuiper_val:.3f}" if kuiper_val else "N/A")
            with col4:
                st.metric("Evaluated", f"{results['n_evaluated']} / {results['n_evaluated'] + results['n_unmatched']}")
            # Detailed breakdown
            with st.expander("Detailed Breakdown"):
                # Check which metrics are available
                has_semantic = 'semantic' in results['overall']
                if has_semantic:
                    st.markdown(f"""
                    | Metric | Value |
                    |--------|-------|
                    | **Accuracy (LLM judge)** | {results['overall']['semantic']:.1f} |
                    | **ANLS*** (string match) | {results['overall']['anls']:.1f} |
                    | **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence'].get('semantic', results['single_evidence']['anls']):.1f} |
                    | **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc'].get('semantic', results['multi_evidence_same_doc']['anls']):.1f} |
                    | **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc'].get('semantic', results['multi_evidence_multi_doc']['anls']):.1f} |
                    | **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
                    | **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
                    """)
                else:
                    st.markdown(f"""
                    | Metric | Value |
                    |--------|-------|
                    | **Overall ANLS*** | {results['overall']['anls']:.1f} |
                    | **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence']['anls']:.1f} |
                    | **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc']['anls']:.1f} |
                    | **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc']['anls']:.1f} |
                    | **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
                    | **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
                    """)
            if results['n_unmatched'] > 0:
                with st.expander(f"{results['n_unmatched']} unmatched questions"):
                    for q in results['unmatched_samples']:
                        st.text(f"• {q}")
                    if results['n_unmatched'] > 5:
                        st.text(f"... and {results['n_unmatched'] - 5} more")
            # Step 2: Model Information
            st.markdown("---")
            st.markdown("#### Step 2: Model Information")
            col1, col2 = st.columns(2)
            with col1:
                model_name = st.text_input("Model Name *", placeholder="e.g., GPT-4o-Agent")
                organization = st.text_input("Organization *", placeholder="e.g., OpenAI")
                model_type = st.selectbox("Model Type *", options=["", "api", "open-weight"])
            with col2:
                description = st.text_area(
                    "Description",
                    placeholder="Brief description of your approach (e.g., 'Vision-language model with sparse search tool')",
                    height=80
                )
                link = st.text_input("Link (Optional)", placeholder="https://arxiv.org/abs/... or https://github.com/...")
            selected_tags = st.multiselect(
                "Tags",
                options=AVAILABLE_TAGS,
                default=["Agentic"],
                help="Select tags that describe your approach"
            )
            # Step 3: Submit
            st.markdown("---")
            st.markdown("#### Step 3: Submit to Leaderboard")
            if st.button("Submit to Leaderboard", type="primary", disabled=not (model_name and organization and model_type)):
                # Validate required fields
                submit_error = None
                if not model_name or not organization or not model_type:
                    submit_error = "Please fill in all required fields (Model Name, Organization, Model Type)"
                elif "Agentic" in selected_tags and st.session_state.predictions:
                    # Agentic submissions must carry effort data on every sample
                    missing_effort = [
                        (i + 1, p.get('question', '')[:60])
                        for i, p in enumerate(st.session_state.predictions)
                        if not _prediction_has_effort(p)
                    ]
                    if missing_effort:
                        samples = "; ".join(f"line {ln}: {q}..." for ln, q in missing_effort[:5])
                        extra = f" (and {len(missing_effort) - 5} more)" if len(missing_effort) > 5 else ""
                        submit_error = (
                            f"**Agentic submissions require effort data for every sample.** "
                            f"{len(missing_effort)} prediction(s) are missing effort information "
                            f"(e.g. `iterations`, `steps`, `llm_calls`, `effort`, or `search_history`). "
                            f"Examples: {samples}{extra}"
                        )
                if submit_error:
                    st.error(submit_error)
                else:
                    # Get current user for submission tracking
                    hf_user = get_hf_user()
                    # Prepare submission data
                    submission = {
                        "model_name": model_name.strip(),
                        "organization": organization.strip(),
                        "description": description.strip() if description else "",
                        "link": link.strip() if link else "",
                        "tags": selected_tags,
                        "submitted_by": hf_user['username'] if hf_user else "anonymous",
                        "metadata": {
                            "model_type": model_type,
                        },
                        "results": {
                            "overall": {
                                "semantic": results['overall'].get('semantic'),
                                "semantic_ci": results['overall'].get('semantic_ci'),
                                "anls": results['overall']['anls'],
                                "page_f1": results['overall']['page_f1'],
                                "doc_f1": results['overall']['doc_f1'],
                                "kuiper": results['overall']['kuiper'],
                            },
                            "single_evidence": results['single_evidence'],
                            "multi_evidence_same_doc": results['multi_evidence_same_doc'],
                            "multi_evidence_multi_doc": results['multi_evidence_multi_doc'],
                            "by_domain": results.get('by_domain', {}),
                        },
                        "submission_date": datetime.now(timezone.utc).isoformat(),
                    }
                    # Upload to HuggingFace Hub
                    with st.spinner("Uploading to leaderboard..."):
                        try:
                            # Create path matching expected structure: {org}/{model}_results_{timestamp}.json
                            safe_org = organization.strip().replace(" ", "_").replace("/", "-")
                            safe_model = model_name.strip().replace(" ", "_").replace("/", "-")
                            timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
                            filename = f"{safe_model}_results_{timestamp}.json"
                            # Fix: use the computed filename. Previously a stray
                            # "(unknown)" placeholder was interpolated here, so every
                            # upload collided on the same bogus path per organization.
                            path_in_repo = f"{safe_org}/{filename}"
                            # Upload using HfApi
                            api = HfApi()
                            # Upload results JSON (in-memory bytes, no temp file needed)
                            api.upload_file(
                                path_or_fileobj=json.dumps(submission, indent=2).encode("utf-8"),
                                path_in_repo=path_in_repo,
                                repo_id=RESULTS_REPO,
                                repo_type="dataset",
                                token=TOKEN,
                                commit_message=f"Add results for {organization}/{model_name}"
                            )
                            # Upload predictions file alongside the results
                            if st.session_state.get('predictions_raw'):
                                predictions_filename = f"{safe_model}_predictions_{timestamp}.jsonl"
                                predictions_path = f"{safe_org}/{predictions_filename}"
                                api.upload_file(
                                    path_or_fileobj=st.session_state.predictions_raw.encode("utf-8"),
                                    path_in_repo=predictions_path,
                                    repo_id=RESULTS_REPO,
                                    repo_type="dataset",
                                    token=TOKEN,
                                    commit_message=f"Add predictions for {organization}/{model_name}"
                                )
                            st.success("Successfully submitted to leaderboard!")
                            st.balloons()
                            # Record submission for rate limiting
                            record_submission(hf_user['username'])
                            # Clear cache to force refresh on next load
                            download_data.clear()
                            load_eval_results.clear()
                            # Clear form state
                            st.session_state.eval_results = None
                            st.session_state.predictions = None
                            st.session_state.predictions_raw = None
                            st.info("Your submission has been saved! The leaderboard will update shortly.")
                            # Auto-refresh after a moment
                            st.rerun(scope="app")
                        except Exception as e:
                            st.error(f"Upload failed: {str(e)}")
                            st.warning("Please ensure HF_TOKEN environment variable is set with write access to the repository.")
                            with st.expander("Submission JSON (for manual upload)"):
                                st.code(json.dumps(submission, indent=2), language="json")
                                st.info(f"""
                                **To submit manually:**
                                1. Copy the JSON above
                                2. Save as `{path_in_repo}`
                                3. Upload to `{RESULTS_REPO}` on HuggingFace Hub
                                Or contact lukasz.borchmann@snowflake.com
                                """)
def get_all_submissions() -> list[dict]:
    """Get all submission files with their metadata."""
    collected: list[dict] = []
    root = Path(EVAL_RESULTS_PATH)
    if not root.exists():
        return collected
    for org_dir in root.iterdir():
        # Skip plain files and hidden directories (e.g. ".cache")
        if not org_dir.is_dir() or org_dir.name.startswith('.'):
            continue
        for result_file in org_dir.glob("*_results_*.json"):
            rel_path = f"{org_dir.name}/{result_file.name}"
            try:
                with open(result_file) as fh:
                    payload = json.load(fh)
                when = payload.get("submission_date")
                if not isinstance(when, str):
                    when = ""
                entry = {
                    "file_path": str(result_file),
                    "relative_path": rel_path,
                    "model_name": payload.get("model_name", "Unknown"),
                    "organization": payload.get("organization", org_dir.name),
                    "submitted_by": payload.get("submitted_by", "Unknown"),
                    "submission_date": when,
                    "accuracy": payload.get("results", {}).get("overall", {}).get("anls", 0.0),
                    "raw_json": json.dumps(payload, indent=2),
                }
            except Exception as exc:
                # Keep a placeholder entry so broken files remain visible/deletable
                entry = {
                    "file_path": str(result_file),
                    "relative_path": rel_path,
                    "model_name": "Error loading",
                    "organization": org_dir.name,
                    "submitted_by": "Unknown",
                    "submission_date": "Unknown",
                    "accuracy": 0.0,
                    "raw_json": f"Error: {exc}",
                }
            collected.append(entry)

    # Sort newest first; non-string dates fall back to the empty string
    def _sort_key(item: dict) -> str:
        value = item.get("submission_date")
        return value if isinstance(value, str) else ""

    collected.sort(key=_sort_key, reverse=True)
    return collected
def delete_submission_from_hub(relative_path: str) -> tuple[bool, str]:
    """Delete a submission file from the HuggingFace Hub."""
    try:
        client = HfApi(token=TOKEN)
        client.delete_file(
            path_in_repo=relative_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
        )
    except Exception as e:
        return False, f"Failed to delete: {str(e)}"
    return True, f"Successfully deleted {relative_path}"
def update_submission_on_hub(relative_path: str, json_content: str) -> tuple[bool, str]:
    """Update a submission file on HuggingFace Hub.

    Args:
        relative_path: Path of the file inside RESULTS_REPO
            (e.g. ``"org/model_results_20250101_000000.json"``).
        json_content: New file content; must parse as JSON.

    Returns:
        ``(success, message)`` tuple; the message is user-displayable.
    """
    # Validate JSON first so we never touch the Hub with malformed content.
    try:
        data = json.loads(json_content)
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON: {str(e)}"
    try:
        api = HfApi(token=TOKEN)
        # Upload the re-serialized JSON directly from memory (same pattern as
        # the submission flow). The previous implementation wrote a temp file
        # and only unlinked it on success, leaking it whenever upload_file
        # raised; uploading bytes removes the temp file entirely.
        api.upload_file(
            path_or_fileobj=json.dumps(data, indent=2).encode("utf-8"),
            path_in_repo=relative_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Admin edit: {relative_path}"
        )
        return True, f"Successfully updated {relative_path}"
    except Exception as e:
        return False, f"Failed to update: {str(e)}"
@st.fragment
def admin_panel():
    """Admin panel for managing submissions.

    Lets the admin refresh/reload leaderboard data, inspect every stored
    submission, edit or delete submission JSON on the Hub, and edit the
    news items. All Hub changes are permanent.
    """
    st.markdown("#### Admin Panel")
    st.markdown("Manage leaderboard submissions. Changes are permanent.")
    # Admin action buttons
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Refresh Submissions", use_container_width=True):
            st.rerun()
    with col2:
        if st.button("Reload from HuggingFace", type="primary", use_container_width=True):
            # Clear all caches
            download_data.clear()
            load_eval_results.clear()
            # Delete local cached files to force fresh download
            if EVAL_RESULTS_PATH.exists():
                shutil.rmtree(EVAL_RESULTS_PATH)
            if EVAL_REQUESTS_PATH.exists():
                shutil.rmtree(EVAL_REQUESTS_PATH)
            # Re-download data
            with st.spinner("Re-downloading data from HuggingFace Hub..."):
                download_data()
            st.success("Leaderboard data reloaded from source!")
            st.rerun(scope="app")
    st.divider()
    submissions = get_all_submissions()
    if not submissions:
        st.info("No submissions found.")
        return
    st.markdown(f"**{len(submissions)} submissions found**")
    # Display each submission
    for i, sub in enumerate(submissions):
        with st.expander(f"{sub['model_name']} ({sub['organization']}) - {sub['submission_date'][:10] if len(sub['submission_date']) > 10 else sub['submission_date']}"):
            col1, col2 = st.columns([3, 1])
            with col1:
                # 'accuracy' holds the stored ANLS* score, which this app writes
                # on a 0-100 scale (see submit flow) — so format with :.1f.
                # The previous :.1%-format multiplied by 100 again and rendered
                # e.g. "8540.0%" instead of "85.4".
                st.markdown(f"""
                **Model:** {sub['model_name']}
                **Organization:** {sub['organization']}
                **Submitted by:** {sub['submitted_by']}
                **Date:** {sub['submission_date']}
                **Accuracy:** {sub['accuracy']:.1f}
                **File:** `{sub['relative_path']}`
                """)
            with col2:
                # Edit button — entering edit mode cancels any pending delete
                if st.button("Edit", key=f"edit_{i}"):
                    st.session_state[f"editing_{i}"] = True
                    st.session_state[f"confirm_delete_{i}"] = False
                # Delete button with confirmation — cancels any pending edit
                if st.button("Delete", key=f"delete_{i}", type="secondary"):
                    st.session_state[f"confirm_delete_{i}"] = True
                    st.session_state[f"editing_{i}"] = False
                if st.session_state.get(f"confirm_delete_{i}", False):
                    st.warning("Are you sure?")
                    col_yes, col_no = st.columns(2)
                    with col_yes:
                        if st.button("Yes", key=f"confirm_yes_{i}", type="primary"):
                            success, message = delete_submission_from_hub(sub['relative_path'])
                            if success:
                                st.success(message)
                                # Clear caches and refresh
                                download_data.clear()
                                load_eval_results.clear()
                                st.session_state[f"confirm_delete_{i}"] = False
                                st.rerun()
                            else:
                                st.error(message)
                    with col_no:
                        if st.button("No", key=f"confirm_no_{i}"):
                            st.session_state[f"confirm_delete_{i}"] = False
                            st.rerun()
            # Edit mode
            if st.session_state.get(f"editing_{i}", False):
                st.markdown("**Edit JSON:**")
                edited_json = st.text_area(
                    "Edit submission JSON",
                    value=sub['raw_json'],
                    height=400,
                    key=f"json_editor_{i}",
                    label_visibility="collapsed"
                )
                col_save, col_cancel = st.columns(2)
                with col_save:
                    if st.button("Save Changes", key=f"save_{i}", type="primary"):
                        success, message = update_submission_on_hub(sub['relative_path'], edited_json)
                        if success:
                            st.success(message)
                            # Clear caches and refresh
                            download_data.clear()
                            load_eval_results.clear()
                            st.session_state[f"editing_{i}"] = False
                            st.rerun()
                        else:
                            st.error(message)
                with col_cancel:
                    if st.button("Cancel", key=f"cancel_{i}"):
                        st.session_state[f"editing_{i}"] = False
                        st.rerun()
            else:
                # Show raw JSON (read-only) - use checkbox instead of expander to avoid nesting
                if st.checkbox("Show JSON", key=f"show_json_{i}"):
                    st.code(sub['raw_json'], language="json")
    # News management section
    st.divider()
    st.markdown("#### News Management")
    news_items = get_news()
    news_json = json.dumps(news_items, indent=2)
    with st.expander("Edit News (JSON)", expanded=False):
        st.markdown("""
        **Format:** Array of objects with `date` (YYYY-MM-DD) and `text` fields.
        ```json
        [
            {"date": "2025-01-04", "text": "Your update message here"},
            ...
        ]
        ```
        """)
        edited_news = st.text_area(
            "News JSON",
            value=news_json,
            height=300,
            key="news_editor",
            label_visibility="collapsed"
        )
        if st.button("Save News", type="primary"):
            try:
                parsed_news = json.loads(edited_news)
                if not isinstance(parsed_news, list):
                    st.error("News must be a JSON array")
                else:
                    success, message = save_news(parsed_news)
                    if success:
                        st.success(message)
                        st.rerun()
                    else:
                        st.error(message)
            except json.JSONDecodeError as e:
                st.error(f"Invalid JSON: {e}")
def main():
    """App entry point: render the Leaderboard / Analysis / About / Submit tabs.

    Handles the OAuth return trip and the ``?analyze=<model>`` deep link before
    rendering, then downloads leaderboard data from the Hub and builds the tab
    layout. An extra Admin tab appears only for the hard-coded admin user.
    """
    # Handle OAuth callback (if returning from HuggingFace login)
    handle_oauth_callback()
    # Handle "analyze" query parameter from leaderboard
    analyze_model = st.query_params.get("analyze")
    if analyze_model:
        st.session_state.selected_model_for_analysis = unquote(analyze_model)
        st.session_state.go_to_analysis_tab = True
        # Clear the query param to avoid re-triggering
        st.query_params.clear()
        # Inject JavaScript to click on the Analysis tab
        # (Streamlit has no API to programmatically switch tabs, so we click
        # the tab element in the parent document; retries until tabs render.)
        import streamlit.components.v1 as components
        components.html("""
        <script>
        // Wait for Streamlit to render, then click Analysis tab
        function clickAnalysisTab() {
            const tabs = window.parent.document.querySelectorAll('[data-baseweb="tab"]');
            if (tabs.length > 1) {
                tabs[1].click(); // Analysis is the second tab (index 1)
            } else {
                // Retry if tabs not yet rendered
                setTimeout(clickAnalysisTab, 100);
            }
        }
        setTimeout(clickAnalysisTab, 200);
        </script>
        """, height=0)
    # Download data from HuggingFace Hub
    with st.spinner("Loading data from HuggingFace Hub..."):
        download_data()
    # Load data
    df = load_eval_results()
    # Check if admin user is logged in (admin access is tied to one username)
    hf_user = get_hf_user()
    is_admin = hf_user and hf_user.get('username', '').lower() == 'borchmann'
    # Tabs - show Admin tab only for admin users
    if is_admin:
        tab1, tab2, tab3, tab4, tab5 = st.tabs(["Leaderboard", "Analysis", "About", "Submit Results", "Admin"])
    else:
        tab1, tab2, tab3, tab4 = st.tabs(["Leaderboard", "Analysis", "About", "Submit Results"])
    # ===== LEADERBOARD TAB =====
    with tab1:
        # Header with icon (fallback to emoji if icon doesn't load)
        if ICON_MEDAL:
            icon_html = f'<img src="{ICON_MEDAL}" style="width: 40px; height: 40px; vertical-align: middle; margin-right: 12px;" />'
        else:
            icon_html = f'<span style="font-size: 36px; margin-right: 12px;">🏆</span>'
        st.markdown(f'<h3 style="display: flex; align-items: center; margin-top: 1.5rem; margin-bottom: 1.2rem;">{icon_html} Leaderboard</h3>', unsafe_allow_html=True)
        if df.empty:
            st.warning("No evaluation results found. Submit your results to appear on the leaderboard!")
        else:
            # ===== FILTERS SIDE BY SIDE =====
            filter_col1, filter_col2 = st.columns(2)
            with filter_col1:
                # TAG FILTER - chips use MID_BLUE (darker, gradient start)
                tags_in_data = get_all_tags_from_df(df)
                all_available_tags = sorted(list(set(AVAILABLE_TAGS + tags_in_data)))
                selected_tags = st.multiselect(
                    "Filter by techniques/features:",
                    options=all_available_tags,
                    default=[],
                    placeholder="Click to filter by tags...",
                    key="tag_filter",
                )
            with filter_col2:
                # COLUMN SELECTOR - chips use SNOWFLAKE_BLUE (lighter, gradient end)
                # Mapping: short chip name -> full column name
                COLUMN_CHIP_NAMES = {
                    "Accuracy": "Accuracy (LLM judge)",
                    "Acc. Single-Hop": "Acc. Single-Hop",
                    "Acc. Cross-Page": "Acc. Cross-Page",
                    "Acc. Cross-Doc": "Acc. Cross-Doc",
                    "ANLS*": "ANLS* (string)",
                    "Attribution": "Attribution (Page F1)",
                    "Attribution (Doc)": "Attribution (Doc F1)",
                    "Effort": "Effort (Kuiper)",
                    "Model Type": "Model Type",
                    "Tags": "Tags",
                }
                # CHIP_TO_COLUMN is the same chip->column dict under a clearer
                # name; COLUMN_TO_CHIP is the actual reverse (column->chip).
                CHIP_TO_COLUMN = COLUMN_CHIP_NAMES
                COLUMN_TO_CHIP = {v: k for k, v in COLUMN_CHIP_NAMES.items()}
                all_columns = list(df.columns)
                # Model and Organization are always visible (not in selector)
                always_visible = ["Model", "Organization"]
                # Hidden columns (used internally but not shown as separate columns)
                hidden_cols = ["Link", "Submission Date", "Description", "_by_domain", "_Accuracy_CI", "_Accuracy_SE"]
                # Full column names that are optional (Tags moved to end)
                optional_full_cols = [c for c in all_columns if c not in hidden_cols + always_visible and c != "Tags"]
                optional_full_cols.append("Tags")  # Add Tags at the end
                # Convert to chip names for display
                optional_chips = [COLUMN_TO_CHIP.get(c, c) for c in optional_full_cols]
                default_chips = ["Model Type", "Tags", "Accuracy", "Attribution", "Effort"]
                default_selected = [c for c in default_chips if c in optional_chips]
                selected_chips = st.multiselect(
                    "Select columns to display:",
                    options=optional_chips,
                    default=default_selected,
                    key="column_selector",
                )
                # Convert selected chips back to full column names
                selected_optional = [CHIP_TO_COLUMN.get(c, c) for c in selected_chips]
            # Apply tag filter
            filtered_df = filter_df_by_tags(df, selected_tags)
            # Show filter status
            if selected_tags:
                st.caption(f"Showing {len(filtered_df)} of {len(df)} models matching selected tags")
            # Model and Organization are always included first
            selected_columns = ["Model", "Organization"] + [c for c in optional_full_cols if c in selected_optional]
            # Initialize uncertainty mode in session state if not present
            if "uncertainty_mode" not in st.session_state:
                st.session_state.uncertainty_mode = "± SE"
            if selected_columns:
                # Render HTML table with proper styling
                render_leaderboard_table(filtered_df, selected_columns, uncertainty_mode=st.session_state.uncertainty_mode)
                # Bottom row: Uncertainty toggle (left) and Download button (right)
                st.markdown("")  # Small spacing
                col1, col2 = st.columns([3, 1])
                with col1:
                    st.radio(
                        "Uncertainty:",
                        options=["± SE", "90% CI", "95% CI", "None"],
                        key="uncertainty_mode",
                        horizontal=True,
                        help="Display uncertainty estimates for accuracy and attribution metrics"
                    )
                with col2:
                    # Right-align the download button but keep its natural width
                    st.markdown('''<style>
                    .st-key-download_csv_btn {
                        width: 100% !important;
                        display: flex;
                        justify-content: flex-end;
                    }
                    .st-key-download_csv_btn button {
                        margin-left: auto !important;
                    }
                    </style>''', unsafe_allow_html=True)
                    csv_df = build_csv_download_df(filtered_df, selected_columns, st.session_state.uncertainty_mode)
                    csv = csv_df.to_csv(index=False)
                    st.download_button(
                        label="Download as CSV",
                        data=csv,
                        file_name="leaderboard.csv",
                        mime="text/csv",
                        key="download_csv_btn",
                    )
            # News and Paper section (two columns)
            st.markdown("<br>", unsafe_allow_html=True)  # Spacing
            news_col, paper_col = st.columns([2, 1])
            with news_col:
                st.markdown("<span style='font-size: 1rem; font-weight: normal;'>Updates</span>", unsafe_allow_html=True)
                news_items = get_news()[:NEWS_MAX_DISPLAY]
                if news_items:
                    for item in news_items:
                        date_str = item.get('date', '')
                        text = item.get('text', '')
                        # Use full date (YYYY-MM-DD)
                        formatted_date = date_str[:10] if len(date_str) >= 10 else date_str
                        st.caption(f"**{formatted_date}**: {text}")
                else:
                    st.caption("No updates yet.")
            with paper_col:
                st.markdown("""
                <div style="text-align: right;">
                    <a href="https://arxiv.org/abs/2603.12180" target="_blank" style="color: #9CA3AF; text-decoration: none;">Strategic Navigation or Stochastic Search?<br>How Agents and Humans Reason Over Document Collections</a>
                </div>
                """, unsafe_allow_html=True)
    # ===== VISUALIZATIONS TAB =====
    with tab2:
        if ICON_EYE:
            icon_html = f'<img src="{ICON_EYE}" style="width: 40px; height: 40px; vertical-align: middle; margin-right: 12px;" />'
        else:
            icon_html = f'<span style="font-size: 36px; margin-right: 12px;">📈</span>'
        st.markdown(f'<h3 style="display: flex; align-items: center; margin-top: 1.5rem; margin-bottom: 1.2rem;">{icon_html} Analysis</h3>', unsafe_allow_html=True)
        if df.empty:
            st.warning("No data available for visualization.")
        else:
            # Check if user came from leaderboard with a specific model
            if st.session_state.get('go_to_analysis_tab'):
                st.info(f"Showing analysis for: **{st.session_state.get('selected_model_for_analysis', '')}**")
                st.session_state.go_to_analysis_tab = False
            # Model details selector - at the top
            st.markdown("#### Model Details")
            model_names = df["Model"].tolist()
            # Use session state to allow setting model from leaderboard
            if 'selected_model_for_analysis' not in st.session_state:
                st.session_state.selected_model_for_analysis = model_names[0] if model_names else None
            # Ensure selected model exists in current data
            selected_index = 0
            if st.session_state.selected_model_for_analysis in model_names:
                selected_index = model_names.index(st.session_state.selected_model_for_analysis)
            selected_model = st.selectbox(
                "Select a model to view detailed breakdown:",
                model_names,
                index=selected_index,
                key="analysis_model_selector"
            )
            if selected_model:
                st.session_state.selected_model_for_analysis = selected_model
                show_model_details(selected_model)
            # Plots below
            st.markdown("---")
            st.markdown("#### Comparative Plots")
            # Two plots side by side
            col1, col2 = st.columns(2)
            with col1:
                fig_attribution = create_accuracy_vs_attribution_plot(df)
                st.plotly_chart(fig_attribution, width="stretch")
            with col2:
                fig_effort = create_accuracy_vs_effort_plot(df)
                st.plotly_chart(fig_effort, width="stretch")
            st.markdown("""
            **Understanding the plots:**
            - Each point represents a model submission
            - **Orange points**: API-based models
            - **Blue points**: Open-weight models
            - Hover over points to see model details
            - **Left plot**: Upper-right = high accuracy with good attribution (optimal)
            - **Right plot**: Upper-left = high accuracy with good effort calibration (optimal)
            """)
    # ===== ABOUT TAB =====
    with tab3:
        if ICON_DOCS:
            icon_html = f'<img src="{ICON_DOCS}" style="width: 40px; height: 40px; vertical-align: middle; margin-right: 12px;" />'
        else:
            icon_html = f'<span style="font-size: 36px; margin-right: 12px;">📖</span>'
        st.markdown(f'<h3 style="display: flex; align-items: center; margin-top: 1.5rem; margin-bottom: 1.2rem;">{icon_html} About</h3>', unsafe_allow_html=True)
        about_col1, about_col2 = st.columns(2)
        with about_col1:
            st.markdown("""
            #### MADQA Benchmark
            This benchmark evaluates AI systems on **Agentic Document Collection Visual Question Answering** —
            a task requiring systems to navigate, retrieve, reason over, and aggregate information from
            heterogeneous document collections.
            📄 [Read the paper: *Strategic Navigation or Stochastic Search?*](https://arxiv.org/abs/2603.12180)
            ##### Dataset
            - **2,250** human-authored question-answer pairs
            - **800** multi-page PDF documents from diverse real-world domains
            - **18,619** total pages with rich visual layouts
            - **17.3%** multi-hop questions (cross-page and cross-document)
            - **63** document categories across **13** high-level domains
            ##### Task Properties
            The task is characterized by six formal properties:
            1. **Extractive**: Answers are drawn from evidence pages, not generated abstractly
            2. **Multi-Hop**: Evidence may span multiple disjoint pages requiring aggregation
            3. **Closed-World**: Answers must be derivable solely from the corpus
            4. **Grounded**: Answers must be faithfully attributed to minimal evidence
            5. **Agentic**: Requires iterative retrieval and reasoning (planning, navigation, aggregation)
            6. **Visual**: Answering may require non-textual information (layout, tables, figures)
            """)
        with about_col2:
            st.markdown("""
            #### Metrics
            ##### Accuracy (LLM judge)
            - **Accuracy (LLM judge)**: Primary metric combining ANLS* string matching with an LLM judge (G-Eval framework). Captures semantic correctness beyond exact string matching, with statistical bias correction
            - **ANLS* (string)**: Pure string-based score using Average Normalized Levenshtein Similarity with optimal element alignment for lists/sets
            - **Acc. Single-Hop**: Accuracy on questions requiring a single evidence page
            - **Acc. Cross-Page**: Accuracy on multi-hop questions within the same document
            - **Acc. Cross-Doc**: Accuracy on multi-hop questions spanning multiple documents
            ##### Attribution (Page F1)
            - **Attribution (Page F1)**: F1 score measuring overlap between cited pages and gold evidence pages (penalizes both missing and spurious citations)
            - **Attribution (Doc F1)**: Document-level attribution accuracy (whether the correct documents were identified)
            ##### Effort (Kuiper)
            - **Effort (Kuiper)**: Measures whether computational effort correlates with problem difficulty. Lower values indicate better calibration—the system "knows what it knows" and doesn't waste effort on unsolvable queries
            ---
            **Contact:** [lukasz.borchmann@snowflake.com](mailto:lukasz.borchmann@snowflake.com)
            """)
    # ===== SUBMIT TAB =====
    with tab4:
        if ICON_WRITE:
            icon_html = f'<img src="{ICON_WRITE}" style="width: 40px; height: 40px; vertical-align: middle; margin-right: 12px;" />'
        else:
            icon_html = f'<span style="font-size: 36px; margin-right: 12px;">📝</span>'
        st.markdown(f'<h3 style="display: flex; align-items: center; margin-top: 1.5rem; margin-bottom: 1.2rem;">{icon_html} Submit Results</h3>', unsafe_allow_html=True)
        if not EVAL_AVAILABLE:
            st.warning("Evaluation module not available. Please install dependencies: `pip install anls-star datasets`")
        # Use fragment to prevent tab switch on file upload
        submit_results_fragment()
    # ===== ADMIN TAB (only for admin users) =====
    if is_admin:
        with tab5:
            admin_panel()


if __name__ == "__main__":
    main()