Ko-AgentBench / tabs /leaderboard_v1_en.py
Ahnj-Stability's picture
fix bug: en version metric chart size
9aade7b
"""
Agent Leaderboard v1 - Main leaderboard interface
Updated implementation with LLM Type support and optimized radar charts
"""
import base64
import math
import re
from datetime import datetime
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
# Import components and styles from modular files
from components.leaderboard_components import (
get_chart_colors, get_rank_badge, get_type_badge,
get_metric_tooltip, get_responsive_styles, get_faq_section
)
from styles.leaderboard_styles import get_leaderboard_css
# Branding icon, inlined as base64 text only when the asset file is present.
ASSET_ICON_PATH = Path("krew_icon.png")
KREW_ICON_BASE64 = (
    base64.b64encode(ASSET_ICON_PATH.read_bytes()).decode("utf-8")
    if ASSET_ICON_PATH.exists()
    else ""
)

# Evaluation date (YYYY-MM-DD): the CSV's modification time when the file
# exists, otherwise the current date.
CSV_PATH = Path("combined_evaluation_summary.csv")
EVALUATION_DATE = (
    datetime.fromtimestamp(CSV_PATH.stat().st_mtime)
    if CSV_PATH.exists()
    else datetime.today()
).strftime("%Y-%m-%d")
def create_leaderboard_v2_tab():
"""Create the main leaderboard v1 tab with interactive table"""
# Heuristics used to turn average token counts into cost and turn estimates.
token_to_cost_factor = 2e-6  # Rough cost per token ($2 per 1M tokens)
tokens_per_turn = 1000  # Approximate tokens exchanged per turn for scaling

# The seven task levels, "L1" through "L7".
level_ids = [f"L{number}" for number in range(1, 8)]

# Metric column that feeds the TSQ average for each level, in level order.
tsq_metric_names = (
    "ArgAcc",
    "SelectAcc",
    "PSM",
    "Coverage",
    "AdaptiveRoutingScore",
    "EffScore",
    "ContextRetention",
)
level_tsq_sources = {
    level: f"{level}_{metric}" for level, metric in zip(level_ids, tsq_metric_names)
}
def load_leaderboard_data():
    """Load the evaluation summary CSV and derive the leaderboard columns.

    Reads ``combined_evaluation_summary.csv`` from the working directory,
    coerces the metric columns to numbers and adds:

    * per-level ``L{n}_Avg_Cost`` / ``L{n}_Avg_Turns`` estimated from
      ``L{n}_Avg_Tokens``;
    * cross-level averages ('Avg AC', 'Avg TSQ', 'Avg Total Cost',
      'Avg Session Duration', 'Avg Turns');
    * capability metrics ('Overall Success', 'Execution Accuracy',
      'Complex Reasoning', 'Robustness', 'Context & Efficiency',
      'Call Validity');
    * 'Model Type' ("Open source" or "Proprietary").

    Returns:
        pandas.DataFrame: The prepared table. Missing values are replaced with
        empty strings, so numeric columns can contain '' placeholders.
    """
    df = pd.read_csv('combined_evaluation_summary.csv')

    # Every column except the descriptive text ones is coerced to numeric.
    # 'LLM Type' has to stay text: coercing it turns "OSS"/"API" into NaN, after
    # which every model would be classified as "Proprietary" further down.
    text_columns = ('Model', 'Vendor', 'LLM Type')
    for col in [name for name in df.columns if name not in text_columns]:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    def add_mean_column(target, candidates):
        """Store the row-wise mean of the candidate columns that exist."""
        present = [col for col in candidates if col in df.columns]
        if present:
            df[target] = df[present].mean(axis=1)

    # Derive per-level helper columns for cost and turns
    sr_columns = []
    tsq_columns = []
    duration_columns = []
    cost_columns = []
    turns_columns = []
    for level in level_ids:
        sr_col = f"{level}_SR"
        if sr_col in df.columns:
            sr_columns.append(sr_col)
            df[sr_col] = df[sr_col].round(3)

        tsq_source = level_tsq_sources.get(level)
        if tsq_source and tsq_source in df.columns:
            tsq_columns.append(tsq_source)

        duration_col = f"{level}_Avg_Exec_Time"
        if duration_col in df.columns:
            duration_columns.append(duration_col)

        token_col = f"{level}_Avg_Tokens"
        if token_col in df.columns:
            cost_col = f"{level}_Avg_Cost"
            turns_col = f"{level}_Avg_Turns"
            df[cost_col] = df[token_col] * token_to_cost_factor
            df[turns_col] = df[token_col] / tokens_per_turn
            cost_columns.append(cost_col)
            turns_columns.append(turns_col)

    # Cross-level averages
    add_mean_column('Avg AC', sr_columns)
    add_mean_column('Avg TSQ', tsq_columns)
    add_mean_column('Avg Total Cost', cost_columns)
    add_mean_column('Avg Session Duration', duration_columns)
    add_mean_column('Avg Turns', turns_columns)

    # Core capability metrics for radar visualization
    add_mean_column('Overall Success', sr_columns)
    add_mean_column('Execution Accuracy', ['L1_CallEM', 'L1_ArgAcc', 'L2_SelectAcc'])
    add_mean_column('Complex Reasoning', ['L3_ProvAcc', 'L3_PSM', 'L4_Coverage'])
    add_mean_column('Robustness', ['L5_AdaptiveRoutingScore', 'L5_FallbackSR'])
    # NOTE(review): 'L6_ReuseRage' looks like a misspelling of 'L6_ReuseRate'.
    # The original spelling is tried first so existing CSVs behave as before;
    # the corrected spelling is only used when the misspelled column is absent.
    reuse_col = 'L6_ReuseRage' if 'L6_ReuseRage' in df.columns else 'L6_ReuseRate'
    add_mean_column('Context & Efficiency', [reuse_col, 'L6_EffScore', 'L7_ContextRetention'])
    add_mean_column('Call Validity', [f"L{i}_EPR_CVR" for i in range(1, 8)])

    # Use LLM Type from CSV directly, with mapping to display names
    if 'LLM Type' in df.columns:
        # Strip whitespace, then treat "OSS" (any letter case) as open source
        # and every other value, including missing ones, as proprietary.
        df['LLM Type'] = df['LLM Type'].astype(str).str.strip()
        is_open_source = df['LLM Type'].str.upper() == "OSS"
        df['Model Type'] = is_open_source.map({True: "Open source", False: "Proprietary"})
    else:
        # Fallback to vendor mapping if LLM Type column doesn't exist
        vendor_model_type_map = {
            "OpenAI": "Proprietary",
            "Anthropic": "Proprietary",
            "Google": "Proprietary",
            "Microsoft": "Proprietary",
            "Mistral": "Proprietary",
            "Databricks": "Open source",
            "Meta": "Open source",
            "Alibaba": "Open source",
            "알리바바": "Open source",  # Korean name for Alibaba
            "Kakao": "Open source",
            "SKT": "Open source",
            "KT": "Open source",
            "xAI": "Proprietary",
        }
        df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary')

    # Round numeric columns for better display
    round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy',
                        'Complex Reasoning', 'Robustness', 'Context & Efficiency', 'Call Validity']
    round_one_cols = ['Avg Session Duration', 'Avg Turns']
    for columns, digits in ((round_three_cols, 3), (round_one_cols, 1)):
        for col in columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').round(digits)

    for columns, digits in ((cost_columns, 3), (turns_columns, 2), (duration_columns, 2)):
        if columns:
            df[columns] = df[columns].apply(pd.to_numeric, errors='coerce').round(digits)

    # Replace missing values with empty strings for display
    return df.fillna('')
def build_static_radar_chart(values, labels):
    """Render a small static radar chart as inline SVG.

    Args:
        values: Sequence of numbers, one per axis. Each value is multiplied by
            the chart radius, so 1.0 reaches the outer ring.
        labels: Axis captions, drawn just outside the outer ring in the same
            order as ``values``. They are HTML-escaped before being embedded.

    Returns:
        str: An ``<svg>`` fragment, or a placeholder ``<div>`` when ``values``
        is empty or contains only zeros.
    """
    from html import escape  # local import keeps this block self-contained

    if not values or all(v == 0 for v in values):
        return """
<div class="radar-placeholder">
<span>Radar Chart</span>
<small>Execution Accuracy · Complex Reasoning · Robustness · Context &amp; Efficiency · Overall Success · Validity</small>
</div>
"""

    size = 220
    center = size / 2
    radius = size * 0.38
    n = len(values)

    def point(v, idx):
        """Return the (x, y) position on axis ``idx`` at fraction ``v`` of the radius."""
        # Axis 0 points straight up; the remaining axes follow at equal angles.
        angle = (2 * math.pi * idx / n) - math.pi / 2
        r = radius * v
        return center + r * math.cos(angle), center + r * math.sin(angle)

    def as_points(coords):
        """Format coordinate pairs for an SVG ``points`` attribute."""
        return " ".join(f"{x:.2f},{y:.2f}" for x, y in coords)

    polygon_points = as_points(point(v, i) for i, v in enumerate(values))

    # Background rings at one third, two thirds and the full radius.
    ring_polygons = []
    for step in (0.33, 0.66, 1.0):
        ring_points = as_points(point(step, i) for i in range(n))
        opacity = 0.04 if step < 1.0 else 0.08
        ring_polygons.append(f'<polygon points="{ring_points}" fill="rgba(245,246,247,{opacity})" stroke="rgba(148,163,184,0.35)" stroke-width="1" />')

    axis_lines = "\n".join(
        f'<line x1="{center:.2f}" y1="{center:.2f}" x2="{point(1, idx)[0]:.2f}" y2="{point(1, idx)[1]:.2f}" stroke="rgba(148,163,184,0.35)" stroke-width="1" />'
        for idx in range(n)
    )

    # Escape the captions so characters such as "&" or "<" cannot break the markup.
    label_spans = "\n".join(
        f'<text x="{point(1.1, idx)[0]:.2f}" y="{point(1.1, idx)[1]:.2f}" text-anchor="middle" dominant-baseline="middle" font-size="9" fill="white">{escape(str(label), quote=False)}</text>'
        for idx, label in enumerate(labels)
    )

    svg = f"""
<svg width="{size}" height="{size}" viewBox="0 0 {size} {size}" xmlns="http://www.w3.org/2000/svg" role="img">
<defs>
<radialGradient id="radarGlow" cx="50%" cy="50%" r="50%">
<stop offset="0%" stop-color="rgba(255,210,30,0.25)" />
<stop offset="100%" stop-color="rgba(255,210,30,0.0)" />
</radialGradient>
</defs>
<rect width="{size}" height="{size}" fill="url(#radarGlow)" opacity="0.2" />
{''.join(ring_polygons)}
{axis_lines}
<polygon points="{polygon_points}" fill="rgba(255,210,30,0.35)" stroke="#ffd21e" stroke-width="2" />
{label_spans}
</svg>
"""
    return svg
# Level metadata for the 7-stage task framework
def _primary_text(text):
    """Wrap text in the span that sets the primary text color and the Nanum Gothic font."""
    return (
        "<span style='color: var(--text-primary); "
        "font-family: \"Nanum Gothic\", sans-serif !important;'>"
        f"{text}</span>"
    )

# (title, description) per level filter; "ALL" is the aggregate view.
level_copy = {
    "ALL": (
        "ALL · All Tasks",
        "First, observe the overall average performance across all seven tasks. This average should then be utilized as a baseline to conduct a more detailed per-level comparison.",
    ),
    "L1": (
        "L1 · Single Tool Call",
        "Evaluates single tool invocation capability and basic command execution accuracy.",
    ),
    "L2": (
        "L2 · Tool Selection",
        "Measures the ability to choose the right tool and invoke it with appropriate parameters.",
    ),
    "L3": (
        "L3 · Sequential Tool Reasoning",
        "Validates multi-step sequential reasoning for solving tasks.",
    ),
    "L4": (
        "L4 · Parallel Tool Reasoning",
        "Evaluates the ability to integrate and summarize information from multiple sources in parallel.",
    ),
    "L5": (
        "L5 · Error Handling & Robustness",
        "Checks awareness of unexpected failures and the strategies used to recover.",
    ),
    "L6": (
        "L6 · Efficient Tool Utilization",
        "Examines operational efficiency in achieving goals with minimal calls and cost.",
    ),
    "L7": (
        "L7 · Long-Context Memory",
        "Analyzes the ability to retain and leverage long conversational context.",
    ),
}
# Both strings of every entry are pre-wrapped HTML, ready to drop into a page.
level_details = {
    key: {"title": _primary_text(title), "description": _primary_text(description)}
    for key, (title, description) in level_copy.items()
}
default_level = "ALL"
# Success-rate column for each level, plus the column used for the aggregate ranking.
overall_sort_column = "Overall Success"
sr_column_map = dict((level, level + "_SR") for level in level_ids)
def resolve_level(level_value):
    """Return ``level_value`` when it is a known level key, else the default level.

    Empty or missing values and keys that are not in ``level_details`` all
    resolve to ``default_level``.
    """
    if level_value and level_value in level_details:
        return level_value
    return default_level
def generate_html_table(filtered_df, highlight_column):
    """Generate the styled HTML leaderboard table with per-level success rates.

    Args:
        filtered_df: DataFrame that is already filtered and sorted; rows are
            ranked in iteration order. 'Model', 'Vendor' and 'Model Type' are
            read from every row; 'Overall Success' and the per-level ``*_SR``
            columns are shown when present and rendered as '-' otherwise.
        highlight_column: Column whose header and cells get the highlight
            classes. Any value other than 'Overall Success' or one of the
            per-level success-rate columns disables highlighting.

    Returns:
        str: A ``<style>`` block followed by the table markup.
    """
    overall_column = "Overall Success"
    if highlight_column not in (*sr_column_map.values(), overall_column):
        highlight_column = None
    overall_highlight = highlight_column == overall_column
    highlight_map = {level: sr_column_map[level] == highlight_column for level in level_ids}

    def numeric_classes(highlighted, highlight_class):
        """Return the class attribute value for a numeric header or cell."""
        return f"numeric-cell {highlight_class}" if highlighted else "numeric-cell"

    def safe_float(value):
        """Convert a cell to float; None for missing, blank or non-numeric values."""
        if value is None:
            return None
        if isinstance(value, str) and value.strip() == '':
            return None
        if pd.isna(value):
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def score_cell(value, highlighted):
        """Render one success-rate cell with three decimals, or '-' when missing."""
        number = safe_float(value)
        display = '-' if number is None else f'{number:.3f}'
        return f'<td class="{numeric_classes(highlighted, "highlight-cell")}">{display}</td>'

    # Fragments are collected in a list and joined once at the end instead of
    # growing a string with repeated concatenation.
    parts = ["""
<style>
/* Dark theme table styling */
.v2-table-container {
background: var(--bg-card);
border-radius: 16px;
overflow: hidden;
border: 1px solid var(--border-subtle);
margin-top: 20px;
}
.v2-styled-table {
width: 100%;
border-collapse: collapse;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
background: var(--bg-card);
color: #FFFFFF;
}
.v2-styled-table thead {
position: sticky;
top: 0;
background: rgba(255, 210, 30, 0.1);
z-index: 1;
}
.v2-styled-table th {
padding: 14px 12px;
text-align: left;
font-weight: 600;
color: #FFFFFF;
border-bottom: 2px solid var(--accent-primary);
font-size: 13px;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.v2-styled-table th.numeric-cell {
text-align: center;
}
.v2-styled-table td {
padding: 12px;
border-bottom: 1px solid var(--border-subtle);
color: #FFFFFF;
transition: all 0.2s ease;
}
.v2-styled-table tbody tr {
transition: all 0.3s ease;
}
.v2-styled-table tbody tr:hover {
background: rgba(255, 210, 30, 0.15) !important;
box-shadow: 0 0 20px rgba(255, 210, 30, 0.3), inset 0 0 20px rgba(255, 210, 30, 0.1);
transform: scale(1.01);
}
.v2-styled-table tbody tr:nth-child(even) {
background: var(--bg-secondary);
}
.model-name {
font-weight: 500;
color: #FFFFFF;
transition: color 0.2s ease;
}
/* Keep model name color consistent on hover to emphasize row highlight */
.v2-styled-table tr:hover .model-name {
color: #FFFFFF;
}
.numeric-cell {
font-family: 'Geist Mono', monospace;
font-size: 13px;
text-align: center;
color: #FFFFFF;
}
.highlight-header {
background: rgba(255, 210, 30, 0.14);
color: #FFFFFF;
}
.highlight-cell {
background: rgba(255, 210, 30, 0.08);
color: #FFFFFF;
font-weight: 600;
}
</style>
<div class="v2-table-container">
<table class="v2-styled-table">
<thead>
<tr>
<th style="width: 80px;">Rank</th>
<th>Model</th>
<th>Vendor</th>
<th style="width: 120px;">LLM Type</th>
"""]

    # Header cells: the overall score first, then one column per level.
    parts.append(f"""
<th class="{numeric_classes(overall_highlight, 'highlight-header')}" title="Average success rate across all levels">
<span class="metric-header">Overall <span class="info-icon">ⓘ</span></span>
</th>
""")
    for level in level_ids:
        parts.append(f"""
<th class="{numeric_classes(highlight_map.get(level), 'highlight-header')}" title="Average success rate for {level}">
<span class="metric-header">{level} <span class="info-icon">ⓘ</span></span>
</th>
""")
    parts.append("""
</tr>
</thead>
<tbody>
""")

    # Body rows: the rank is the 1-based position in the incoming frame.
    for rank, (_, row) in enumerate(filtered_df.iterrows(), start=1):
        parts.append(f"""
<tr>
<td>{get_rank_badge(rank)}</td>
<td class="model-name">{row['Model']}</td>
<td>{row['Vendor']}</td>
<td>{get_type_badge(row['Model Type'])}</td>
""")
        parts.append(score_cell(row.get(overall_column, ''), overall_highlight))
        for level in level_ids:
            parts.append(score_cell(row.get(sr_column_map[level], ''), highlight_map.get(level)))
        parts.append("</tr>")

    parts.append("""
</tbody>
</table>
</div>
""")
    return "".join(parts)
def update_leaderboard_title(level_filter):
    """Build the leaderboard heading markup for the selected level.

    Args:
        level_filter: Level key chosen in the UI; unknown or empty values fall
            back to the default level via ``resolve_level``.

    Returns:
        str: HTML with the level's title and description.
        NOTE(review): the outer container and the ``dataframe-container``
        ``<div>`` are left open here; presumably they are closed by markup
        emitted elsewhere - confirm.
    """
    # resolve_level only ever returns a key that exists in level_details.
    level_info = level_details[resolve_level(level_filter)]
    level_title = level_info["title"]
    level_description = level_info["description"]
    return f"""
<div class="domain-selector-container leaderboard-intro">
<div class="domain-header">
<h2 class="domain-title" >Agent Leaderboard · {level_title}</h2>
<p class="domain-subtitle" >{level_description}</p>
</div>
<div class="dataframe-container">
"""
# Maps the short model-type filter codes to the labels stored in 'Model Type'.
model_type_lookup = dict(OSS="Open source", API="Proprietary")
def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"):
    """Apply shared filters and sorting to the leaderboard dataframe.

    Args:
        df: Leaderboard data; it is copied, never modified in place.
        level_filter: Level key; normalized with ``resolve_level``.
        model_type_filter: "All" keeps every row. Otherwise rows are kept when
            'Model Type' equals the mapped label (short codes are translated
            through ``model_type_lookup``; other values are used verbatim).
        sort_order: "Ascending" sorts ascending; any other value descending.
        sort_by: Preferred sort column. When it is not a column of the data,
            the overall column (level "ALL") or the level's success-rate
            column is used instead.

    Returns:
        tuple: ``(filtered_df, level_key, highlight_column)``, where
        ``highlight_column`` is the column to emphasize or None.
    """
    filtered_df = df.copy()
    level_key = resolve_level(level_filter)

    if model_type_filter != "All":
        mapped_type = model_type_lookup.get(model_type_filter, model_type_filter)
        filtered_df = filtered_df[filtered_df['Model Type'] == mapped_type]

    # Pick the sort column: explicit request first, then the level's default.
    if sort_by in filtered_df.columns:
        actual_sort_column = sort_by
    elif level_key == "ALL":
        actual_sort_column = overall_sort_column
    else:
        actual_sort_column = sr_column_map.get(level_key)

    # The highlighted column follows the selected level, not the sort column.
    if level_key in sr_column_map:
        highlight_column = sr_column_map[level_key]
    elif level_key == "ALL" and overall_sort_column in filtered_df.columns:
        highlight_column = overall_sort_column
    else:
        highlight_column = None

    if actual_sort_column and actual_sort_column in filtered_df.columns:

        def numeric_when_possible(series):
            """Sort key: numeric view of the column, or the column itself for text."""
            # The loaded data uses '' for missing scores; comparing '' with
            # floats raises TypeError, so blanks are coerced to NaN and placed last.
            numeric = pd.to_numeric(series, errors='coerce')
            return numeric if numeric.notna().any() else series

        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
            key=numeric_when_possible,
        )

    return filtered_df, level_key, highlight_column
def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order):
    """Reload the leaderboard data, filter and sort it, and return the HTML table."""
    leaderboard = load_leaderboard_data()
    rows, _level_key, highlight = apply_filters(
        leaderboard, level_filter, model_type_filter, sort_order, sort_by=sort_by
    )
    # Only the rows and the highlighted column are needed to render the table.
    return generate_html_table(rows, highlight)
# Load initial data
initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending")
initial_df = load_leaderboard_data()  # Load raw data for model selector

# Order the models for the selectors: best overall score first, or by name
# when no usable score exists.
if not initial_df.empty:
    # Guard the lookup: without the column there is nothing to coerce, and
    # calling .notna() on the missing value would raise.
    if 'Overall Success' in initial_df.columns:
        # The loaded data uses '' for missing scores, so coerce before sorting.
        overall_success_numeric = pd.to_numeric(initial_df['Overall Success'], errors='coerce')
        has_overall_scores = bool(overall_success_numeric.notna().any())
    else:
        overall_success_numeric = None
        has_overall_scores = False

    if has_overall_scores:
        initial_df = initial_df.assign(**{'Overall Success': overall_success_numeric}).sort_values(
            'Overall Success', ascending=False, na_position='last'
        )
    else:
        initial_df = initial_df.sort_values('Model')

# Default selections: top 5 models for the selectors, top 12 for the heatmap.
initial_model_names = initial_df['Model'].tolist() if len(initial_df) > 0 else []
initial_selected_models = initial_model_names[:5]
initial_heatmap_models = initial_model_names[:12]
initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models)

initial_level_metric_level = level_ids[0] if level_ids else None
initial_level_model_choices = list(initial_model_names)
initial_level_model_values = initial_level_model_choices[:5]
initial_level_metric_chart = create_level_metric_chart(
    initial_df,
    initial_level_metric_level,
    initial_level_model_values
) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available")
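# Combine the shared leaderboard CSS, the responsive styles, and this page's own <style>/<script> markup.
# NOTE(review): Gradio documents its HTML component as rendering static HTML only, so the inline
# <script> below may never run - confirm, or move it to the Blocks `js`/`head` parameters.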
custom_css = get_leaderboard_css() + get_responsive_styles() + """
<style>
/* Page-specific styles for leaderboard v2 */
/* Metric header styles with info icons */
.metric-header {
cursor: help;
display: inline-flex;
align-items: center;
gap: 6px;
}
.info-icon {
color: var(--accent-secondary);
font-size: 1em;
opacity: 0.8;
transition: opacity 0.2s ease;
font-weight: normal;
}
.metric-header:hover .info-icon {
opacity: 1;
}
/* Native tooltip styling */
.v2-styled-table th[title] {
cursor: help;
}
/* Custom tooltip using CSS only */
[data-tooltip] {
position: relative;
cursor: help;
}
[data-tooltip]::before {
content: attr(data-tooltip);
position: absolute;
bottom: 100%;
left: 50%;
transform: translateX(-50%);
background: rgba(26, 26, 46, 0.95);
color: #f5f6f7;
padding: 8px 12px;
border-radius: 6px;
font-size: 12px;
white-space: nowrap;
max-width: 300px;
z-index: 10000;
opacity: 0;
pointer-events: none;
transition: opacity 0.3s;
margin-bottom: 5px;
border: 1px solid rgba(16, 152, 247, 0.3);
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.8);
}
[data-tooltip]:hover::before {
opacity: 0.8;
}
/* Dark theme table styling */
.v2-table-container {
background: var(--bg-card);
border-radius: 16px;
overflow: visible; /* Changed from hidden to visible for tooltips */
border: 1px solid var(--border-subtle);
margin-top: 20px;
position: relative;
}
.v2-styled-table {
width: 100%;
border-collapse: collapse;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
background: var(--bg-card);
color: #FFFFFF;
}
.v2-styled-table thead {
position: sticky;
top: 0;
background: rgba(255, 210, 30, 0.1);
z-index: 1;
}
.v2-styled-table th {
padding: 14px 12px;
text-align: left;
font-weight: 600;
color: #FFFFFF;
border-bottom: 2px solid var(--accent-primary);
font-size: 14px;
text-transform: uppercase;
letter-spacing: 0.05em;
position: relative; /* Added for tooltip positioning */
}
.v2-styled-table td {
padding: 12px;
border-bottom: 1px solid var(--border-subtle);
color: #FFFFFF;
font-size: 14px;
transition: all 0.2s ease;
}
.v2-styled-table tbody tr {
transition: all 0.3s ease;
}
.v2-styled-table tbody tr:hover {
background: rgba(255, 210, 30, 0.15) !important;
box-shadow: 0 0 20px rgba(255, 210, 30, 0.3), inset 0 0 20px rgba(255, 210, 30, 0.1);
transform: scale(1.01);
}
.v2-styled-table tbody tr:nth-child(even) {
background: var(--bg-secondary);
}
.model-name {
font-weight: 500;
color: var(--accent-primary);
font-size: 14px;
transition: color 0.2s ease;
}
.v2-styled-table tr:hover .model-name {
color: var(--accent-secondary);
}
.numeric-cell {
font-family: 'Geist Mono', monospace;
font-size: 14px;
text-align: center;
}
</style>
<script>
// Function to update radio button styling
function updateRadioStyling() {
// Remove selected class from all labels first
document.querySelectorAll('.selected').forEach(function(label) {
label.classList.remove('selected');
});
document.querySelectorAll('.domain-radio label').forEach(function(label) {
label.style.background = '';
label.style.borderColor = '';
label.style.transform = '';
label.style.fontWeight = '';
});
// Apply selected class to checked radio buttons
document.querySelectorAll('input[type="radio"]:checked').forEach(function(input) {
var label = input.closest('label');
if (label) {
label.classList.add('selected');
// For domain radio buttons, apply special styling
if (label.closest('.domain-radio')) {
label.style.background = '#ffd21e33';
label.style.borderColor = 'var(--accent-primary)';
label.style.transform = 'scale(1.05)';
label.style.fontWeight = '600';
}
}
});
}
// Wait for Gradio to initialize
function initializeRadioStyles() {
updateRadioStyling();
// Create observer to watch for changes
var observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
if (mutation.type === 'attributes' && mutation.attributeName === 'checked') {
updateRadioStyling();
}
});
});
// Observe all radio inputs
document.querySelectorAll('input[type="radio"]').forEach(function(radio) {
observer.observe(radio, { attributes: true });
});
}
// Try multiple initialization strategies
document.addEventListener('DOMContentLoaded', function() {
setTimeout(initializeRadioStyles, 100);
setTimeout(initializeRadioStyles, 500);
setTimeout(initializeRadioStyles, 1000);
});
// Also check when window loads
window.addEventListener('load', function() {
setTimeout(initializeRadioStyles, 100);
});
// Listen for Gradio's custom events
document.addEventListener('gradio:loaded', initializeRadioStyles);
</script>
"""
gr.HTML(custom_css)
# Header styles and navigation
gr.HTML("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap');
/* Enhanced button styling with better gradio compatibility */
.header-action-button {
display: inline-block !important;
padding: 14px 28px !important;
background: #ffd21e !important;
color: #FFFFFF !important;
text-decoration: none !important;
border-radius: 16px !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 700 !important;
font-size: 1.1rem !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
border: none !important;
cursor: pointer !important;
box-shadow: 0 8px 24px rgba(255, 210, 30, 0.4), 0 4px 12px rgba(0, 0, 0, 0.3) !important;
position: relative !important;
overflow: hidden !important;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.35) !important;
}
.header-action-button::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
transition: left 0.6s;
}
.header-action-button:hover::before {
left: 100%;
}
.header-action-button:hover {
transform: translateY(-3px) !important;
box-shadow: 0 12px 32px rgba(255, 210, 30, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important;
background: #ffd21e !important;
color: #FFFFFF !important;
text-decoration: none !important;
text-shadow: 0 2px 6px rgba(0, 0, 0, 0.45) !important;
}
.header-action-button:active {
transform: translateY(-1px) !important;
}
.action-button-icon {
font-size: 1.2rem !important;
margin-right: 8px !important;
filter: drop-shadow(0 0 8px rgba(255, 255, 255, 0.3));
}
.hero-banner-wrapper {
position: relative;
width: 100vw;
margin: 0 calc(-50vw + 50%) 20px calc(-50vw + 50%);
border-radius: 0 !important;
overflow: hidden !important;
box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25) !important;
}
.hero-banner-wrapper::before {
content: "";
position: absolute;
inset: 0;
background: #01091A;
z-index: 0;
}
#hero-banner {
position: relative;
width: 100% !important;
height: auto !important;
z-index: 1;
}
#hero-banner img {
width: 100% !important;
height: auto !important;
display: block !important;
object-fit: cover !important;
}
.hero-title {
font-size: 10rem;
font-weight: 800;
line-height: 1.1;
background: linear-gradient(135deg, #FFE082 0%, #FFC107 50%, #FFB300 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 1rem;
font-family: 'Nanum Gothic', sans-serif !important;
}
.hero-subtitle {
color: var(--text-secondary);
font-size: 3rem;
font-family: 'Nanum Gothic', sans-serif !important;
margin-top: 0;
}
.hero-actions {
display: flex;
justify-content: center;
align-items: center;
gap: 16px;
flex-wrap: wrap;
margin: 32px 0;
padding: 0 20px;
}
.hero-action-button {
display: inline-flex !important;
align-items: center !important;
gap: 12px !important;
padding: 10px 16px !important;
background: rgba(245, 246, 247, 0.06) !important;
border: 1px solid var(--border-subtle) !important;
border-radius: 999px !important;
color: #FFFFFF !important;
text-decoration: none !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 600 !important;
font-size: 0.95rem !important;
transition: all 0.3s ease !important;
backdrop-filter: blur(10px) !important;
-webkit-backdrop-filter: blur(10px) !important;
}
.hero-action-button:hover {
transform: translateY(-2px) !important;
border-color: var(--accent-primary) !important;
background: rgba(255, 210, 30, 0.12) !important;
text-decoration: none !important;
}
.hero-action-button svg {
width: 20px;
height: 20px;
}
.hero-action-button span {
font-weight: 600;
letter-spacing: 0.01em;
}
.dashboard-section {
margin: 48px auto 0 auto;
max-width: 1100px;
padding: 40px;
background: rgba(245, 246, 247, 0.06);
border: 1px solid var(--border-subtle);
border-radius: 24px;
box-shadow: 0 12px 30px rgba(0, 0, 0, 0.25);
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
font-family: 'Nanum Gothic', sans-serif !important;
}
.dashboard-section.emphasized {
background: #ffd21e26;
border-color: rgba(255, 210, 30, 0.6);
box-shadow: 0 24px 50px rgba(255, 210, 30, 0.25);
}
.dashboard-section .section-header {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
text-align: center;
gap: 12px;
margin-bottom: 32px;
}
.section-title {
font-size: 3.75rem;
font-weight: 700;
color: #FFFFFF;
margin-bottom: 12px;
text-align: center !important;
font-family: 'Nanum Gothic', sans-serif !important;
}
.section-lead, .section-subtitle {
font-size: 1.32rem !important;
color: var(--text-secondary);
max-width: 720px;
margin: 0 auto 24px auto;
line-height: 1.7;
text-align: center !important;
word-break: keep-all;
white-space: normal;
display: block;
font-family: 'Nanum Gothic', sans-serif !important;
}
.phase-grid {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 24px;
}
.phase-card {
padding: 28px;
border-radius: 20px;
border: 1px solid var(--border-subtle);
background: rgba(1, 9, 26, 0.65);
box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.03);
}
.phase-card h3 {
font-size: 1.44rem !important;
color: #FFFFFF;
margin-bottom: 20px;
font-weight: 700;
font-family: 'Nanum Gothic', sans-serif !important;
}
.phase-chart {
width: 120px;
height: 120px;
border-radius: 50%;
background: conic-gradient(var(--accent-primary) var(--progress), rgba(255, 210, 30, 0.15) var(--progress));
display: flex;
align-items: center;
justify-content: center;
margin-bottom: 24px;
margin-left: auto;
margin-right: auto;
position: relative;
}
.phase-chart::after {
content: '';
position: absolute;
width: 80px;
height: 80px;
border-radius: 50%;
background: rgba(1, 9, 26, 0.95);
}
.phase-chart span {
position: relative;
font-size: 1.2rem !important;
font-weight: 700;
color: #FFFFFF !important;
font-family: 'Nanum Gothic', sans-serif !important;
}
/* Additional specific selectors */
.phase-card .phase-chart span {
color: #FFFFFF !important;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8) !important;
font-family: 'Nanum Gothic', sans-serif !important;
}
.phase-grid .phase-chart span {
color: #FFFFFF !important;
z-index: 10 !important;
font-family: 'Nanum Gothic', sans-serif !important;
}
.phase-list {
list-style: none;
padding: 0;
margin: 0;
display: grid;
gap: 12px;
}
.phase-list li {
padding: 12px 16px;
border-radius: 12px;
background: rgba(245, 246, 247, 0.05);
border: 1px solid rgba(245, 246, 247, 0.08);
color: var(--text-secondary);
font-size: 1.08rem !important;
font-family: 'Nanum Gothic', sans-serif !important;
}
.scenario-body {
max-width: 760px;
margin: 0 auto;
text-align: center;
}
.scenario-body p {
font-size: 1.05rem;
line-height: 1.7;
color: var(--text-secondary);
margin-bottom: 32px;
}
.section-flow {
display: flex;
justify-content: center;
align-items: center;
margin-top: 24px;
color: var(--accent-primary);
font-size: 2rem;
}
.criteria-grid {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 24px;
}
.criteria-card {
padding: 24px;
border-radius: 20px;
border: 1px solid var(--border-subtle);
background: rgba(1, 9, 26, 0.7);
display: flex;
flex-direction: column;
gap: 16px;
box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.03);
}
.criteria-card h3 {
font-size: 1.25rem;
font-weight: 700;
color: #FFFFFF;
margin: 0;
}
.criteria-card ul {
list-style: disc;
margin: 0;
padding-left: 20px;
color: var(--text-secondary);
display: grid;
gap: 10px;
font-size: 0.95rem;
line-height: 1.5;
}
/* Responsive design */
@media (max-width: 768px) {
.hero-title {
font-size: 10rem;
}
.hero-action-button {
width: 100% !important;
justify-content: center !important;
}
.dashboard-section {
padding: 28px;
margin: 32px 16px 0 16px;
}
.phase-grid {
grid-template-columns: 1fr;
}
.criteria-grid {
grid-template-columns: 1fr;
}
}
@media (max-width: 480px) {
.hero-actions {
flex-direction: column;
gap: 8px;
}
.section-title {
font-size: 2.7rem;
}
.phase-chart {
width: 100px;
height: 100px;
}
.phase-chart::after {
width: 68px;
height: 68px;
}
}
</style>
""")
gr.HTML("<div class='hero-banner-wrapper'>")
gr.Image(
value="banner_wide.png",
show_label=False,
interactive=False,
type="filepath",
elem_id="hero-banner"
)
gr.HTML("</div>")
gr.HTML("""
<div style="text-align: center; padding: 20px 0;">
<h1 class="hero-title">Hugging Face KREW Ko-AgentBench</h1>
<p class="hero-subtitle">Agent benchmark optimized for real Korean usage.</p>
</div>
""")
# Links section below title
gr.HTML("""
<div class="hero-actions">
<a href="https://hugging-face-krew.github.io/" target="_blank" rel="noopener noreferrer" class="hero-action-button">
<svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
<line x1="8" y1="12" x2="16" y2="12"/>
</svg>
<span>Blog</span>
</a>
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
<svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M9 19c-5 1.5-5-2.5-7-3"/>
<path d="M20 21v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
</svg>
<span>GitHub</span>
</a>
<a href="https://huggingface.co/datasets/huggingface-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
<svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="7 10 12 15 17 10"/>
<line x1="12" y1="15" x2="12" y2="3"/>
</svg>
<span>Dataset</span>
</a>
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench/blob/main/README_en.md#-metrics" target="_blank" rel="noopener noreferrer" class="hero-action-button">
<svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M3 3v18h18"/>
<path d="M7 17v-6"/>
<path d="M12 17V7"/>
<path d="M17 17v-3"/>
</svg>
<span>Metrics</span>
</a>
</div>
""")
# Section 1: Task Design by Stage
gr.HTML("""
<div class="dashboard-section">
<div class="section-header">
<h2 class="section-title" style="font-family: 'Nanum Gothic', sans-serif; font-size: 2.5rem;">7-Level Task Design</h2>
</div>
<p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 720px; line-height: 1.7; word-break: keep-all;">We analyzed agent capabilities across seven stages—from simple tool calls to long-context retention and robustness.</p>
<div class="phase-grid">
<div class="phase-card">
<h3>Single Turn</h3>
<div class="phase-chart" style="--progress:80%;">
<span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">80%</span>
</div>
<ul class="phase-list">
<li style="color: #FFFFFF;">L1: Single Tool Call</li>
<li style="color: #FFFFFF;">L2: Tool Selection</li>
<li style="color: #FFFFFF;">L3: Sequential Tool Reasoning</li>
<li style="color: #FFFFFF;">L4: Parallel Tool Reasoning</li>
<li style="color: #FFFFFF;">L5: Error Handling & Robustness</li>
</ul>
</div>
<div class="phase-card">
<h3>Multi Turn</h3>
<div class="phase-chart" style="--progress:20%;">
<span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">20%</span>
</div>
<ul class="phase-list">
<li style="color: #FFFFFF;">L6: Efficient Tool Utilization</li>
<li style="color: #FFFFFF;">L7: Long-Context Memory</li>
</ul>
</div>
</div>
</div>
""")
# Section 2: Core Scenario Design
gr.HTML("""
<div class="dashboard-section emphasized">
<div class="section-header">
<h2 class="section-title" style="font-size: 2.0rem;">High-quality scenario design tailored to 18 Korea-specific APIs and real-world use cases.</h2>
</div>
<div class="scenario-body">
<p style="color: var(--text-primary);">We built realistic scenarios—such as appointment booking and blog review search—by integrating APIs widely used in Korea including Naver Maps, Kakao services, and local websites.</p>
</div>
</div>
<div class="section-flow">⌄</div>
""")
# Section 3: Key Evaluation Criteria
gr.HTML("""
<div class="dashboard-section">
<div class="section-header">
<h2 class="section-title" style="font-size: 2.0rem;">Key Evaluation Criteria</h2>
</div>
<div class="criteria-grid">
<div class="criteria-card">
<h3>Cache-based Iterative Evaluation</h3>
<ul>
<li>Improved handling of failed API responses</li>
<li>Addresses chronic benchmark issues such as mismatched response attributes</li>
<li>Ensures benchmark consistency and reliability</li>
</ul>
</div>
<div class="criteria-card">
<h3>Robustness Testing</h3>
<ul>
<li>Evaluates recognition and response strategies for intentional failure scenarios (e.g., discontinued products)</li>
<li>Surfaces models that remain stable in real-world deployments</li>
</ul>
</div>
<div class="criteria-card">
<h3>Level-specific Precision Metrics</h3>
<ul>
<li>Evaluates each phase of problem solving, including tool selection, parameter setup, and data flow</li>
<li>Quantitatively identifies model strengths and weaknesses</li>
</ul>
</div>
</div>
</div>
""")
# Metrics overview cards removed per updated design
# Domain filter section with enhanced styling
gr.HTML("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap');
/* Enhanced domain selector styling */
.domain-selector-container {
background: #ffd21e0d;
border-radius: 20px;
padding: 32px;
margin-bottom: 32px;
border: 1px solid var(--border-subtle);
position: relative;
overflow: visible;
box-shadow:
0 8px 32px rgba(0, 0, 0, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.05);
}
.domain-selector-container.leaderboard-intro {
padding-bottom: 0;
margin-bottom: 32px;
}
.leaderboard-intro .domain-header {
margin-bottom: 20px;
}
.leaderboard-intro .domain-subtitle {
font-size: 1.1rem;
max-width: 720px;
margin: 0 auto;
}
.leaderboard-intro .dataframe-container {
margin: 16px -32px -32px;
padding: 0 32px 32px;
background: transparent;
border-radius: 0 0 20px 20px;
}
.domain-performance-container {
margin-bottom: 32px;
}
.domain-performance-container .domain-header {
margin-bottom: 24px;
}
.domain-performance-container .domain-subtitle {
font-size: 1.05rem;
max-width: 720px;
margin: 0 auto;
}
.leaderboard-intro .domain-title,
.domain-performance-container > .domain-header .domain-title,
.performance-card-container > .domain-header .domain-title {
font-size: 2.6rem !important;
}
.performance-card-content {
display: flex;
flex-direction: column;
gap: 24px;
align-items: stretch;
}
.performance-card-display {
flex: 1;
min-width: 0;
}
.performance-card-container {
margin-top: 32px;
}
.performance-card-container .domain-header {
margin-bottom: 24px;
}
.performance-card-container .domain-subtitle {
font-size: 1.05rem;
max-width: 720px;
margin: 0 auto;
}
.domain-header {
text-align: center;
margin-bottom: 28px;
position: relative;
z-index: 1;
}
.domain-title {
font-size: 2rem;
font-weight: 800;
position: relative;
display: inline-block;
margin-bottom: 8px;
padding: 4px 0;
color: transparent !important;
background: linear-gradient(120deg, rgba(255, 255, 255, 0.75) 0%, rgba(255, 210, 30, 0.95) 25%, rgba(255, 139, 0, 0.85) 50%, rgba(255, 210, 30, 0.95) 75%, rgba(255, 255, 255, 0.8) 100%);
background-size: 220% 100%;
-webkit-background-clip: text;
background-clip: text;
-webkit-text-fill-color: transparent;
text-shadow: 0 0 3px rgba(255, 210, 30, 0.08), 0 0 8px rgba(255, 210, 30, 0.05);
filter: drop-shadow(0 0 2px rgba(255, 210, 30, 0.06));
letter-spacing: 0.02em;
animation: title-shimmer 1.25s ease-in-out infinite;
font-family: 'Nanum Gothic', sans-serif !important;
}
@keyframes title-shimmer {
0% {
background-position: 0% 50%;
}
50% {
background-position: 100% 50%;
}
100% {
background-position: 0% 50%;
}
}
.domain-subtitle {
color: var(--text-secondary);
font-size: 1.2rem;
font-family: 'Geist', sans-serif;
}
/* Custom radio button styling */
.domain-radio {
display: flex !important;
gap: 12px !important;
flex-wrap: wrap !important;
justify-content: center !important;
position: relative;
z-index: 1;
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 6px 0 !important;
}
/* Gradio radio button wrapper */
.domain-radio .wrap {
display: flex !important;
gap: 12px !important;
flex-wrap: wrap !important;
justify-content: center !important;
width: 100% !important;
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 6px 0 !important;
}
.domain-selector-container [role="radiogroup"],
.domain-selector-container fieldset,
.domain-selector-container .gradio-radio,
.domain-selector-container .gradio-radio-group,
.domain-selector-container .gr-form,
.domain-selector-container .gradio-radio-group > div {
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
}
#filters-sorting-container {
padding: 28px !important;
}
#filters-sorting-container .gr-box,
#filters-sorting-container .gradio-column,
#filters-sorting-container .gradio-row,
#filters-sorting-container .gradio-group,
#filters-sorting-container .gradio-radio,
#filters-sorting-container [role="radiogroup"],
#filters-sorting-container fieldset {
background: transparent !important;
border: none !important;
box-shadow: none !important;
}
.filters-sorting-row {
gap: 18px !important;
justify-content: center !important;
flex-wrap: nowrap !important;
}
.filter-group {
flex: 1 1 260px !important;
display: flex !important;
flex-direction: column !important;
gap: 12px !important;
align-items: flex-start !important;
width: 100% !important;
}
.filter-group-row {
display: flex !important;
align-items: center !important;
gap: 4px !important;
justify-content: flex-start !important;
flex-wrap: nowrap !important;
width: 100% !important;
}
.filter-group-row > * {
margin: 0 !important;
padding: 0 !important;
flex: 0 0 auto !important;
width: auto !important;
min-width: auto !important;
}
.filter-group-row > .gr-column,
.filter-group-row > .gr-box {
flex: 0 0 auto !important;
width: auto !important;
min-width: auto !important;
}
.filter-group-row .gradio-html,
.filter-group-row .gradio-html > * {
display: inline-flex !important;
align-items: center !important;
margin: 0 !important;
padding: 0 !important;
flex: 0 0 auto !important;
width: auto !important;
min-width: auto !important;
}
.filter-group-row .domain-radio {
flex: 1 1 auto !important;
width: 100% !important;
margin: 0 !important;
display: inline-flex !important;
align-items: center !important;
min-width: 220px !important;
justify-content: flex-start !important;
padding-right: 8px !important;
}
.filter-group .gr-input-label {
font-size: 1rem !important;
font-weight: 600 !important;
color: #FFFFFF !important;
text-align: center !important;
margin-bottom: 12px !important;
}
.filter-group-label {
font-size: 1rem !important;
font-weight: 600 !important;
color: #FFFFFF !important;
text-align: left !important;
margin: 0 !important;
font-family: 'Geist', sans-serif !important;
white-space: nowrap !important;
}
#filters-sorting-container .domain-radio,
#filters-sorting-container .domain-radio .wrap {
flex-wrap: nowrap !important;
justify-content: flex-start !important;
align-items: center !important;
gap: 12px !important;
width: 100% !important;
}
.domain-radio label,
.domain-radio .wrap > label {
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
gap: 0 !important;
padding: 12px 28px !important;
background: rgba(245, 246, 247, 0.06) !important;
border: 1px solid var(--border-subtle) !important;
border-radius: 999px !important;
cursor: pointer !important;
transition: all 0.3s ease !important;
text-align: center !important;
position: relative !important;
overflow: hidden !important;
color: #FFFFFF !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 600 !important;
font-size: 0.95rem !important;
letter-spacing: 0.01em !important;
text-align: center !important;
line-height: 1 !important;
backdrop-filter: blur(10px) !important;
-webkit-backdrop-filter: blur(10px) !important;
min-width: 0 !important;
flex: 0 0 auto !important;
box-shadow: 0 0 18px -6px rgba(0, 0, 0, 0.45) !important;
z-index: 0 !important;
}
.domain-radio label::before {
content: '';
position: absolute;
inset: 0;
background: #ffd21e14;
opacity: 0;
transition: opacity 0.3s ease;
pointer-events: none;
z-index: -1;
}
.domain-radio label:hover {
border-color: var(--accent-primary) !important;
background: rgba(255, 210, 30, 0.12) !important;
box-shadow: 0 0 18px 0 rgba(255, 210, 30, 0.45) !important;
border-color: rgba(255, 210, 30, 0.45) !important;
}
.domain-radio label:hover::before {
opacity: 1;
}
.domain-radio input[type="radio"] {
display: none !important;
}
.domain-radio input[type="radio"]:checked + label,
.domain-radio .wrap > label:has(input[type="radio"]:checked),
.domain-radio label.selected,
.domain-radio label[aria-checked="true"] {
background: #ffd21e33 !important;
border-color: var(--accent-primary) !important;
color: var(--accent-tertiary) !important;
box-shadow: 0 0 24px 0 rgba(255, 210, 30, 0.55) !important;
}
.domain-radio input[type="radio"]:checked + label::before,
.domain-radio label[aria-checked="true"]::before {
opacity: 1;
}
/* Model selector styling */
.model-selector-container {
padding: 28px;
}
.model-selector-container .domain-header {
margin-bottom: 18px;
}
.model-selector-container .domain-title {
font-size: 1.8rem;
}
.model-selector-container .domain-subtitle {
font-size: 1rem;
}
.model-selector-container .model-dropdown,
.model-selector-container .pill-button {
display: flex !important;
justify-content: center !important;
align-items: center !important;
width: 100% !important;
}
.model-selector-container .model-dropdown {
margin-bottom: 12px !important;
}
.model-dropdown {
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
margin: 0 !important;
}
.model-dropdown .gradio-dropdown,
.model-dropdown .gradio-dropdown > div,
.model-dropdown .gr-form,
.model-dropdown .gradio-input,
.model-dropdown .gradio-button-group,
.model-dropdown .wrap {
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
margin: 0 !important;
width: 100% !important;
}
.model-dropdown select,
.model-dropdown [role="combobox"] {
background: #000000 !important;
border: 1px solid #333333 !important;
border-radius: 999px !important;
padding: 12px 24px !important;
color: #FFFFFF !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 600 !important;
font-size: 1rem !important;
letter-spacing: 0.01em !important;
min-height: 46px !important;
min-width: 240px !important;
width: 100% !important;
cursor: pointer !important;
transition: all 0.3s ease !important;
text-align: center !important;
text-align-last: center !important;
}
.model-dropdown select:focus,
.model-dropdown [role="combobox"]:focus-visible {
outline: none !important;
border-color: var(--accent-primary) !important;
box-shadow: 0 0 0 3px rgba(255, 210, 30, 0.25) !important;
}
.model-dropdown button {
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
gap: 8px !important;
width: 100% !important;
padding: 12px 24px !important;
background: #000000 !important;
border: 1px solid #333333 !important;
border-radius: 999px !important;
color: #FFFFFF !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 600 !important;
font-size: 0.95rem !important;
letter-spacing: 0.01em !important;
cursor: pointer !important;
transition: all 0.3s ease !important;
}
.model-dropdown button:hover:not(:disabled),
.model-dropdown [role="combobox"]:hover {
transform: translateY(-1px) !important;
border-color: var(--accent-primary) !important;
box-shadow: 0 12px 28px rgba(255, 210, 30, 0.25) !important;
background: rgba(255, 210, 30, 0.12) !important;
}
.model-dropdown .tags {
display: flex !important;
flex-wrap: wrap !important;
gap: 6px !important;
justify-content: center !important;
}
.model-dropdown .tag {
background: rgba(255, 210, 30, 0.18) !important;
border: 1px solid rgba(255, 210, 30, 0.35) !important;
color: #FFFFFF !important;
border-radius: 999px !important;
padding: 4px 10px !important;
font-size: 0.85rem !important;
font-weight: 500 !important;
}
.model-dropdown label {
display: flex !important;
flex-direction: column !important;
gap: 8px !important;
width: 100% !important;
align-items: center !important;
}
.model-dropdown label > span {
display: none !important;
}
.pill-button button {
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
gap: 8px !important;
padding: 12px 28px !important;
background: #ffd21e !important;
border: 1px solid rgba(255, 210, 30, 0.6) !important;
border-radius: 999px !important;
color: #FFFFFF !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 600 !important;
font-size: 0.95rem !important;
letter-spacing: 0.01em !important;
text-decoration: none !important;
cursor: pointer !important;
transition: all 0.3s ease !important;
box-shadow: 0 12px 28px rgba(255, 210, 30, 0.25) !important;
border-bottom: none !important;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.35) !important;
}
.pill-button button:hover:not(:disabled) {
transform: translateY(-2px) !important;
background: #ffd21e !important;
box-shadow: 0 16px 36px rgba(255, 210, 30, 0.35) !important;
text-shadow: 0 2px 6px rgba(0, 0, 0, 0.45) !important;
}
.pill-button button:disabled {
opacity: 0.6 !important;
cursor: not-allowed !important;
box-shadow: none !important;
}
.level-selector-container {
padding: 28px;
}
.level-selector-container .domain-header {
margin-bottom: 18px;
}
.level-selector-container .domain-title {
font-size: 1.8rem;
}
.level-selector-container .domain-subtitle {
font-size: 1rem;
}
/* Domain icons */
.domain-icon {
font-size: 1.5rem;
margin-bottom: 4px;
display: block;
filter: drop-shadow(0 0 10px white);
}
.domain-name {
font-size: 0.95rem;
font-weight: 500;
margin-top: 4px;
}
/* Badge for domain counts */
.domain-count {
position: absolute;
top: 8px;
right: 8px;
background: var(--accent-primary);
color: #FFFFFF;
font-size: 0.75rem;
padding: 2px 8px;
border-radius: 12px;
font-weight: 600;
opacity: 0.8;
}
/* Filter radio buttons styling - smaller for better fit */
.filter-radio {
max-width: 100% !important;
}
.filter-radio .gr-row {
gap: 8px !important;
}
.filter-radio .gr-column {
min-width: 0 !important;
flex: 1 !important;
}
.filter-radio .gr-form {
min-width: 0 !important;
}
.filter-radio .gr-radio-group {
gap: 4px !important;
}
.filter-radio .domain-radio {
display: flex !important;
gap: 4px !important;
flex-wrap: nowrap !important;
justify-content: center !important;
}
.filter-radio .domain-radio label {
min-width: auto !important;
max-width: 120px !important;
padding: 8px 12px !important;
font-size: 0.8rem !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
/* Additional targeting for the specific filter components */
.filter-radio .gr-box {
padding: 8px !important;
}
.filter-radio .gr-radio {
gap: 4px !important;
}
.filter-radio .gr-input-label {
font-size: 0.85rem !important;
margin-bottom: 4px !important;
}
/* Force compact layout for the filters */
@media (max-width: 1400px) {
.filter-radio .domain-radio label {
padding: 6px 10px !important;
font-size: 0.75rem !important;
}
}
/* Compact filter row styling */
.compact-filter-row {
margin-bottom: 20px !important;
}
.compact-filter-row .gr-column {
padding: 0 8px !important;
}
.compact-filter-row .gr-box {
padding: 0 !important;
}
/* Compact radio button styling */
.compact-radio {
width: 100% !important;
}
.compact-radio > label {
font-size: 0.85rem !important;
margin-bottom: 8px !important;
font-weight: 600 !important;
color: #FFFFFF !important;
display: block !important;
}
.compact-radio .wrap {
display: flex !important;
flex-wrap: nowrap !important;
gap: 4px !important;
justify-content: center !important;
}
.compact-radio .wrap > label {
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
padding: 6px 10px !important;
margin: 0 !important;
background: var(--bg-card) !important;
border: 1px solid var(--border-default) !important;
border-radius: 8px !important;
cursor: pointer !important;
transition: all 0.2s ease !important;
font-size: 0.75rem !important;
white-space: nowrap !important;
flex: 1 !important;
min-width: 0 !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
.compact-radio .wrap > label:has(input[type="radio"]:checked) {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: #FFFFFF !important;
font-weight: 600 !important;
}
.compact-radio .wrap > label:hover {
background: rgba(255, 210, 30, 0.1) !important;
border-color: var(--accent-primary) !important;
transform: scale(1.02) !important;
}
.compact-radio input[type="radio"] {
display: none !important;
}
/* Target Gradio's data attributes for selected state */
.compact-radio label[data-selected="true"],
.compact-radio label[aria-checked="true"],
.domain-radio label[data-selected="true"],
.domain-radio label[aria-checked="true"] {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: #FFFFFF !important;
font-weight: 600 !important;
}
/* Sort by radio buttons */
.sort-by-radio .domain-radio {
display: flex !important;
gap: 10px !important;
flex-wrap: wrap !important;
justify-content: flex-start !important;
}
.sort-by-radio .domain-radio .wrap {
display: flex !important;
gap: 10px !important;
flex-wrap: wrap !important;
justify-content: flex-start !important;
width: 100% !important;
}
.sort-by-radio .domain-radio label,
.sort-by-radio .domain-radio .wrap > label {
min-width: 180px !important;
max-width: 220px !important;
padding: 12px 20px !important;
font-size: 0.95rem !important;
}
/* Leaderboard controls row styling */
.leaderboard-controls-row {
margin: 20px 0 !important;
padding: 20px !important;
background: transparent !important;
border: none !important;
gap: 40px !important;
}
.leaderboard-controls-row .gr-column,
.leaderboard-controls-row .gr-row,
.leaderboard-controls-row .gr-box,
.leaderboard-controls-row .gradio-column,
.leaderboard-controls-row .gradio-row,
.leaderboard-controls-row .gradio-group {
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
}
/* Remove all container backgrounds for leaderboard controls */
.leaderboard-controls-row * {
background-color: transparent !important;
background-image: none !important;
border: none !important;
box-shadow: none !important;
}
.leaderboard-controls-row .inline-radio,
.leaderboard-controls-row .domain-radio {
background: transparent !important;
border: none !important;
box-shadow: none !important;
}
/* Inline radio styling for integrated controls */
.inline-radio {
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
}
.inline-radio .wrap {
display: flex !important;
gap: 8px !important;
flex-wrap: wrap !important;
justify-content: flex-start !important;
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
}
.inline-radio label {
padding: 8px 16px !important;
background: rgba(245, 246, 247, 0.06) !important;
border: 1px solid var(--border-subtle) !important;
border-radius: 20px !important;
font-size: 0.85rem !important;
color: #FFFFFF !important;
transition: all 0.2s ease !important;
cursor: pointer !important;
}
.inline-radio label:hover {
background: rgba(255, 210, 30, 0.12) !important;
border-color: var(--accent-primary) !important;
}
.inline-radio input[type="radio"]:checked + label,
.inline-radio label[aria-checked="true"] {
background: rgba(255, 210, 30, 0.2) !important;
border-color: var(--accent-primary) !important;
color: #FFFFFF !important;
font-weight: 600 !important;
}
</style>
""")
level_options = list(level_details.keys())
# Main leaderboard table with dynamic title and integrated controls
leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
# Integrated controls within leaderboard section - stacked vertically
gr.HTML("<p style='color: var(--text-primary); margin: 5px 0 5px 0; font-size: 1.2rem;'>Select Task Level</p>")
domain_filter = gr.Radio(
choices=level_options,
value=default_level,
label="",
interactive=True,
container=False,
elem_classes=["domain-radio", "inline-radio"]
)
gr.HTML("<p style='color: var(--text-primary); margin: 5px 0 0px 0; font-size: 1.2rem;'>🔍 Filters & Sorting</p>")
with gr.Row():
with gr.Column(scale=1):
gr.HTML("<span style='color: var(--text-primary); font-size: 1.2rem; margin-bottom: 5px; display: block;'>Model Access</span>")
model_type_filter = gr.Radio(
choices=["All", "OSS", "API"],
value="All",
label="",
elem_classes=["domain-radio", "inline-radio"],
container=False
)
with gr.Column(scale=1):
gr.HTML("<span style='color: var(--text-primary);'>Sort Order</span>")
sort_order = gr.Radio(
choices=["Descending", "Ascending"],
value="Descending",
label="",
elem_classes=["domain-radio", "inline-radio"],
container=False
)
leaderboard_table = gr.HTML(initial_table)
# Radar Chart Section
gr.HTML("""
<div class="domain-selector-container domain-performance-container">
<div class="domain-header">
<h2 class="domain-title" >Core Capability Radar</h2>
<p class="domain-subtitle" style="color: var(--text-primary);">Track six essential axes: <br>success, execution, reasoning, robustness, efficiency, and call validity.</p>
</div>
""")
gr.HTML("<p >Select models to compare (up to 5).</p>")
# gr.HTML("<p style='color: #b0b0b0; margin: 0 0 10px 0; font-size: 0.9rem;'>You can select up to five models.</p>")
model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist()[:10],
value=initial_df['Model'].tolist()[:5],
multiselect=True,
label="",
info=None,
container=False,
)
# Radar chart plot - wrapped in centered container
gr.HTML('<div class="chart-container radar-chart-container">')
radar_chart = gr.Plot(
label="",
value=create_domain_radar_chart(
load_leaderboard_data(),
initial_df['Model'].tolist()[:5]
),
elem_classes=["radar-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# Define generate_performance_card function before using it
def generate_performance_card(model_name):
    """Generate the HTML for a single model's performance card.

    The card contains an identity header (icon, model name, vendor,
    evaluation date), the model's rank, a static mini radar chart, six core
    metric tiles laid out as two rows of three, and one tile per level.

    Rank is the 1-based position of the model when all rows that have a
    numeric 'Overall Success' value are sorted by that column in descending
    order; it is 'N/A' when the model has no such value.

    Args:
        model_name: Value of the 'Model' column identifying the row to
            render. A falsy value yields a placeholder message.

    Returns:
        str: An HTML fragment (the card, or a centered placeholder message
        when no model is given or the model is not in the data).
    """
    if not model_name:
        return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
        Please select a model to generate its performance card
        </div>"""

    # Get model data
    df = load_leaderboard_data()
    model_data = df[df['Model'] == model_name]
    if model_data.empty:
        return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
        Model not found in the database
        </div>"""
    row = model_data.iloc[0]

    # Get overall rank based on overall success
    df_with_success = df.copy()
    df_with_success['Overall Success'] = pd.to_numeric(
        # Explicit dtype avoids the "default dtype for empty Series" warning
        # when the column is missing.
        df_with_success.get('Overall Success', pd.Series(dtype=float)),
        errors='coerce',
    )
    df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
    df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
    # The model is absent from the ranking when its 'Overall Success' is not
    # numeric; check for that explicitly instead of swallowing every exception.
    rank_positions = df_sorted.index[df_sorted['Model'] == model_name]
    rank = rank_positions[0] + 1 if len(rank_positions) > 0 else 'N/A'

    def format_score(value):
        """Format a score with three decimals, or 'N/A' when missing."""
        if pd.isna(value) or value == '':
            return 'N/A'
        return f"{float(value):.3f}"

    def clamp_unit(value):
        """Coerce a metric to a float in [0, 1]; missing/invalid becomes 0.0."""
        if pd.isna(value) or value == '':
            return 0.0
        try:
            return max(0.0, min(1.0, float(value)))
        except (TypeError, ValueError):
            return 0.0

    def render_metric_cards(metrics):
        """Render one core-metric tile per (label, value) pair."""
        return "".join(
            f"""
            <div class="core-metric-card">
            <div class="metric-label">{label}</div>
            <div class="metric-value">{format_score(value)}</div>
            </div>
            """
            for label, value in metrics
        )

    radar_metrics = [
        ("Execution Accuracy", row.get('Execution Accuracy')),
        ("Complex Reasoning", row.get('Complex Reasoning')),
        ("Robustness", row.get('Robustness')),
        ("Context & Efficiency", row.get('Context & Efficiency')),
        ("Overall Success", row.get('Overall Success')),
        ("Validity", row.get('Call Validity')),
    ]
    radar_values = [clamp_unit(value) for _, value in radar_metrics]
    radar_labels = [label for label, _ in radar_metrics]
    mini_radar_html = build_static_radar_chart(radar_values, radar_labels)

    # A level without a mapped SR column falls through to '' and renders 'N/A'.
    level_blocks = [
        (level, row.get(sr_column_map.get(level), ''))
        for level in level_ids
    ]

    if KREW_ICON_BASE64:
        icon_html = f'<img src="data:image/png;base64,{KREW_ICON_BASE64}" alt="Krew icon" />'
    else:
        icon_html = '<div class="icon-fallback">🤖</div>'

    # Tile order on the card is controlled here, independently of the radar
    # axis order above; labels not listed are placed last.
    ordered_labels = ["Execution Accuracy", "Complex Reasoning", "Robustness", "Context & Efficiency", "Overall Success", "Validity"]
    ordered_metrics = sorted(
        radar_metrics,
        key=lambda item: ordered_labels.index(item[0]) if item[0] in ordered_labels else len(ordered_labels),
    )
    top_metrics = ordered_metrics[:3]
    bottom_metrics = ordered_metrics[3:]

    level_tiles_html = "".join(
        f"""
        <div class="level-tile">
        <div class="level-tile-label">{level}</div>
        <div class="level-tile-score">{format_score(value)}</div>
        </div>
        """
        for level, value in level_blocks
    )

    card_parts = [
        f"""
        <div class="performance-card">
        <div class="card-top-row">
        <div class="model-identity">
        <div class="model-icon">{icon_html}</div>
        <div class="model-meta">
        <div class="card-model-name">{model_name}</div>
        <div class="meta-line">Vendor · <span>{row['Vendor']}</span></div>
        <div class="meta-line evaluation">
        <span class="date-pill">Evaluation Date</span>
        <span>{EVALUATION_DATE}</span>
        </div>
        </div>
        </div>
        <div class="rank-panel">
        <div class="rank-label">RANK</div>
        <div class="rank-value">#{rank}</div>
        </div>
        </div>
        <div class="card-main">
        <div class="card-body">
        <div class="radar-slot">
        {mini_radar_html}
        </div>
        <div class="core-section">
        <div class="core-metric-grid">
        """,
        """
        <div class="core-metric-row">
        """,
        render_metric_cards(top_metrics),
        """
        </div>
        <div class="core-metric-row">
        """,
        render_metric_cards(bottom_metrics),
        # Closes: second metric row, metric grid, core section, card body.
        """
        </div>
        </div>
        </div>
        </div>
        <div class="level-strip">
        """,
        level_tiles_html,
        # Closes: level strip, card main, performance card.
        """
        </div>
        </div>
        </div>
        """,
    ]
    return "".join(card_parts)
# MODEL PERFORMANCE CARD SECTION
gr.HTML("""
<div class="domain-selector-container performance-card-container">
<div class="domain-header">
<h2 class="domain-title" >Model Performance Card</h2>
<p class="domain-subtitle" style="color: var(--text-primary);">
Explore detailed performance cards that visualize six core metrics plus overall SR across L1–L7 levels.
</p>
<p class="domain-note" style="color: #bdbdbd; font-size: 0.85em; margin-top: 4px;">
※ Ranks are determined by the average SR across L1–L7.
</p>
</div>
<div class="performance-card-content">
""")
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
gr.HTML("""
<p class="domain-subtitle" style="color: var(--text-primary);">Choose a model to generate its analysis card.</p>
""")
card_model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist(),
value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None,
label="",
info=None,
container=False,
# elem_classes=["model-dropdown"]
)
download_card_btn = gr.Button(
"Download as PNG",
elem_id="download-card-btn-en",
elem_classes=["pill-button"]
)
gr.HTML("""
<div class="performance-card-display" id="card-display-container-en">
""")
# Card display area - generate initial card
initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None
initial_card_html = generate_performance_card(initial_model) if initial_model else ""
card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html-en")
gr.HTML("""
</div>
</div>
</div>
""")
# Level metric breakdown section
gr.HTML("""
<div class="domain-selector-container domain-performance-container level-metrics-wrapper">
<div class="domain-header">
<h2 class="domain-title" >Level-specific Metrics</h2>
<p class="domain-subtitle" style="color: var(--text-primary);">Compare model scores with each Ko-AgentBench level's dedicated metrics for deeper insights.</p>
</div>
""")
with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
level_metric_selector = gr.Dropdown(
choices=level_ids,
value=level_ids[0] if level_ids else None,
multiselect=False,
label="",
info=None,
container=False,
elem_classes=["level-dropdown"]
)
level_model_selector = gr.Dropdown(
choices=initial_level_model_choices,
value=initial_level_model_values,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
)
gr.HTML('<div class="chart-container level-metric-chart-container">')
level_metric_chart = gr.Plot(
label="",
value=initial_level_metric_chart,
elem_classes=["level-metric-plot", "plot-container"]
)
gr.HTML("""
</div>
</div>
""")
# # Heatmap section
# gr.HTML("""
# <div class="domain-selector-container domain-performance-container heatmap-wrapper">
# <div class="domain-header">
# <h2 class="domain-title" >Comprehensive Performance Heatmap</h2>
# <p class="domain-subtitle" >See each model's L1–L7 SR scores at a glance.</p>
# </div>
# <div class="chart-container heatmap-chart-container">
# """)
# heatmap_chart = gr.Plot(
# label="",
# value=initial_heatmap,
# elem_classes=["heatmap-plot", "plot-container"]
# )
# gr.HTML("""
# </div>
# </div>
# """)
# Update functions
def get_optimal_sort_order(sort_by_value):
    """Pick the sort direction that lists the best models first for a metric.

    "Overall Success" and every column in ``sr_column_map`` for the known
    levels are treated as higher-is-better. No lower-is-better metric is
    registered, so the ascending branch is only an extension point.
    """
    higher_is_better = ["Overall Success", *(sr_column_map[lvl] for lvl in level_ids)]
    lower_is_better = []  # nothing registered yet

    if sort_by_value in higher_is_better:
        return "Descending"
    if sort_by_value in lower_is_better:
        return "Ascending"
    # Unknown metrics fall back to descending order.
    return "Descending"
def update_table(level_filter, model_type_filter, sort_order):
    """Rebuild the leaderboard title and table HTML for the current filters.

    The table is sorted by "Overall Success" when every level is shown,
    otherwise by the column that ``sr_column_map`` holds for the resolved
    level (falling back to "Overall Success" when there is none).

    Returns:
        A ``(title_html, table_html)`` tuple.
    """
    title_html = update_leaderboard_title(level_filter)

    if level_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(level_filter), "Overall Success")

    return title_html, filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Refresh both model pickers and both charts after a leaderboard filter changes.

    Returns:
        A 4-tuple: the radar model dropdown, the radar figure, the
        level-metric model dropdown and the level-metric figure.
    """
    leaderboard_df = load_leaderboard_data()
    if domain_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    filtered_df, _, _ = apply_filters(leaderboard_df, domain_filter, model_type_filter, sort_order, sort_metric)

    ranked_models = filtered_df['Model'].tolist()
    radar_choices = ranked_models[:15]  # the radar picker only offers the top 15

    # Keep only selections that survived the filter.
    radar_selection = [name for name in (selected_models or []) if name in radar_choices]
    if len(radar_selection) > 5:
        gr.Warning("You can select up to 5 models.")
        # Drop the most recently added model rather than keeping the first five.
        radar_selection = radar_selection[:-1]
    if not radar_selection:
        # Nothing valid left (or nothing chosen): fall back to the top five.
        radar_selection = radar_choices[:5]

    radar_fig = create_domain_radar_chart(filtered_df, radar_selection)

    # Level metric chart
    if selected_level in level_ids:
        active_level = selected_level
    elif level_ids:
        active_level = level_ids[0]
    else:
        active_level = None

    level_choices = ranked_models
    level_selection = [name for name in (level_selected_models or []) if name in level_choices][:5]
    if not level_selection:
        level_selection = level_choices[:5]

    if active_level:
        level_fig = create_level_metric_chart(filtered_df, active_level, level_selection)
    else:
        level_fig = create_empty_level_metric_chart("Select a level to view its metrics")

    radar_dropdown = gr.Dropdown(
        choices=radar_choices,
        value=radar_selection,
        multiselect=True,
        label="",
        info=None,
        container=False,
    )
    level_dropdown = gr.Dropdown(
        choices=level_choices,
        value=level_selection,
        multiselect=True,
        label="",
        info=None,
        container=False,
        elem_classes=["model-dropdown", "level-model-dropdown"],
    )
    return radar_dropdown, radar_fig, level_dropdown, level_fig
def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Rebuild the radar and level-metric outputs after the radar model picker changes.

    The filters are re-applied to freshly loaded data. The radar selection is
    validated against the same top-15 list that is returned as the dropdown's
    choices, so the dropdown never carries a value missing from its choices.

    Returns:
        A 4-tuple: the radar model dropdown, the radar figure, the
        level-metric model dropdown and the level-metric figure.
    """
    df = load_leaderboard_data()
    sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)

    available_models_all = filtered_df['Model'].tolist()
    radar_choices = available_models_all[:15]  # choices offered by the radar picker

    if selected_models:
        valid_selected = [m for m in selected_models if m in radar_choices]
        if len(valid_selected) > 5:
            gr.Warning("You can select up to 5 models.")
            # Drop the most recently added model rather than keeping the first five.
            valid_selected = valid_selected[:-1]
        if not valid_selected:
            valid_selected = radar_choices[:5]
    else:
        valid_selected = radar_choices[:5]

    effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)

    # The level-metric picker offers every filtered model; it is capped
    # silently here because it is not the control the user just touched.
    available_level_models = available_models_all
    if level_selected_models:
        valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
        if not valid_level_models:
            valid_level_models = available_level_models[:5]
    else:
        valid_level_models = available_level_models[:5]

    if effective_level:
        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models)
    else:
        level_metric_fig = create_empty_level_metric_chart("Select a level to view its metrics")

    return (
        gr.Dropdown(
            choices=radar_choices,
            value=valid_selected,
            multiselect=True,
            label="",
            info=None,
            container=False,
        ),
        create_domain_radar_chart(filtered_df, valid_selected),
        gr.Dropdown(
            choices=available_level_models,
            value=valid_level_models,
            multiselect=True,
            label="",
            info=None,
            container=False,
            elem_classes=["model-dropdown", "level-model-dropdown"]
        ),
        level_metric_fig,
    )
def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Rebuild the level-metric model picker and chart.

    ``selected_models`` (the radar picker's value) is accepted so the handler
    shares an input list with the other callbacks; it is not used here.

    Returns:
        A ``(level_model_dropdown, level_metric_figure)`` tuple.
    """
    leaderboard_df = load_leaderboard_data()
    if domain_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    filtered_df, _, _ = apply_filters(leaderboard_df, domain_filter, model_type_filter, sort_order, sort_metric)

    model_choices = filtered_df['Model'].tolist()

    # Keep only selections that are still present after filtering.
    chosen = [name for name in (level_selected_models or []) if name in model_choices]
    if len(chosen) > 5:
        gr.Warning("You can select up to 5 models.")
        # Drop the most recently added model rather than keeping the first five.
        chosen = chosen[:-1]
    if not chosen:
        chosen = model_choices[:5]

    if selected_level in level_ids:
        active_level = selected_level
    elif level_ids:
        active_level = level_ids[0]
    else:
        active_level = None

    if active_level:
        level_fig = create_level_metric_chart(filtered_df, active_level, chosen)
    else:
        level_fig = create_empty_level_metric_chart("Select a level to view its metrics")

    level_dropdown = gr.Dropdown(
        choices=model_choices,
        value=chosen,
        multiselect=True,
        label="",
        info=None,
        container=False,
        elem_classes=["model-dropdown", "level-model-dropdown"],
    )
    return level_dropdown, level_fig
# Update table when filters change
filter_inputs = [domain_filter, model_type_filter, sort_order]
for input_component in filter_inputs:
input_component.change(
fn=update_table,
inputs=filter_inputs,
outputs=[leaderboard_title, leaderboard_table]
)
# Also update radar chart when filters change
input_component.change(
fn=update_radar_chart,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
)
# Update radar chart when model selection changes
model_selector.change(
fn=update_radar_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
)
level_metric_selector.change(
fn=update_level_metric_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[level_model_selector, level_metric_chart]
)
level_model_selector.change(
fn=update_level_metric_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[level_model_selector, level_metric_chart]
)
# Add custom CSS for the performance card
gr.HTML("""
<style>
/* Performance Card Styles */
.performance-card {
background: linear-gradient(140deg, rgba(1, 9, 26, 0.95) 0%, rgba(20, 34, 58, 0.65) 100%);
border: 1px solid rgba(255, 210, 30, 0.35);
border-radius: 24px;
padding: 28px;
max-width: 820px;
margin: 0 auto;
box-shadow: 0 18px 36px rgba(0, 0, 0, 0.35);
display: flex;
flex-direction: column;
gap: 28px;
}
.card-top-row {
display: flex;
justify-content: space-between;
gap: 24px;
align-items: stretch;
}
.card-main {
display: flex;
flex-direction: column;
gap: 20px;
}
.model-identity {
display: flex;
gap: 20px;
align-items: center;
}
.model-icon {
width: 88px;
height: 88px;
background: rgba(245, 246, 247, 0.08);
border-radius: 20px;
display: flex;
align-items: center;
justify-content: center;
padding: 12px;
border: 1px solid rgba(245, 246, 247, 0.12);
}
.model-icon img {
width: 100%;
height: 100%;
object-fit: contain;
}
.icon-fallback {
font-size: 2.8rem;
}
.model-meta {
display: flex;
flex-direction: column;
gap: 6px;
}
.card-model-name {
font-size: 1.9rem;
font-weight: 800;
letter-spacing: 0.01em;
color: #FFFFFF;
}
.meta-line {
font-size: 0.95rem;
color: var(--text-secondary);
display: flex;
gap: 6px;
align-items: center;
}
.meta-line span {
color: #FFFFFF;
font-weight: 600;
}
.date-pill {
display: inline-flex;
align-items: center;
gap: 6px;
background: rgba(255, 210, 30, 0.18);
color: var(--accent-primary);
padding: 2px 10px;
border-radius: 999px;
font-size: 0.75rem;
font-weight: 600;
letter-spacing: 0.05em;
text-transform: uppercase;
}
.rank-panel {
min-width: 140px;
background: rgba(255, 210, 30, 0.12);
border: 1px solid rgba(255, 210, 30, 0.35);
border-radius: 20px;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
padding: 18px;
gap: 6px;
}
.rank-label {
font-size: 0.9rem;
letter-spacing: 0.1em;
color: var(--text-secondary);
}
.rank-value {
font-size: 2.4rem;
font-weight: 800;
color: #FFFFFF;
letter-spacing: 0.04em;
}
.radar-legend {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(130px, 1fr));
gap: 10px;
font-size: 0.8rem;
color: var(--text-tertiary);
letter-spacing: 0.04em;
text-transform: uppercase;
}
.card-body {
display: grid;
grid-template-columns: 240px 1fr;
gap: 28px;
align-items: start;
}
.radar-slot {
width: 240px;
height: 240px;
display: flex;
align-items: center;
justify-content: center;
border-radius: 20px;
background: rgba(245, 246, 247, 0.04);
border: 1px dashed rgba(245, 246, 247, 0.12);
padding: 10px;
box-sizing: border-box;
justify-self: center;
}
.radar-slot svg {
width: 100%;
height: 100%;
}
.heatmap-wrapper {
margin-top: 40px;
}
.heatmap-chart-container {
padding: 0 20px 32px;
}
.radar-chart-container {
padding: 0 20px 32px;
}
.heatmap-plot {
width: 100%;
max-width: 1400px;
margin: 0 auto;
border-radius: 22px;
overflow: hidden;
background: rgba(245, 246, 247, 0.02);
border: 1px solid var(--border-subtle);
box-shadow: 0 14px 40px rgba(0, 0, 0, 0.35);
}
.heatmap-plot .modebar {
display: none;
}
.level-metrics-wrapper {
margin-top: 40px;
}
.level-metric-controls {
justify-content: center;
align-items: center;
margin-bottom: 20px;
gap: 12px;
}
.level-metric-chart-container {
padding: 0 20px 32px;
}
.level-metric-plot {
width: 100%;
max-width: 1400px;
margin: 0 auto;
border-radius: 22px;
overflow: hidden;
background: rgba(245, 246, 247, 0.02);
border: 1px solid var(--border-subtle);
box-shadow: 0 14px 40px rgba(0, 0, 0, 0.35);
}
.level-dropdown {
max-width: 260px !important;
margin: 0 auto !important;
}
.level-dropdown select,
.level-dropdown [role="combobox"],
.level-dropdown button {
background: #000000 !important;
border: 1px solid #333333 !important;
border-radius: 999px !important;
padding: 12px 20px !important;
color: #FFFFFF !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 600 !important;
font-size: 0.95rem !important;
text-align: center !important;
min-height: 46px !important;
transition: all 0.3s ease !important;
box-shadow: 0 10px 24px rgba(0, 0, 0, 0.3) !important;
}
.level-dropdown select:hover,
.level-dropdown [role="combobox"]:hover,
.level-dropdown button:hover:not(:disabled) {
transform: translateY(-1px) !important;
border-color: var(--accent-primary) !important;
box-shadow: 0 12px 28px rgba(255, 210, 30, 0.25) !important;
background: rgba(255, 210, 30, 0.12) !important;
}
.level-model-dropdown {
width: 100% !important;
margin: 12px auto 0 !important;
}
.level-model-dropdown select,
.level-model-dropdown [role="combobox"],
.level-model-dropdown button {
background: #000000 !important;
border: 1px solid #333333 !important;
color: #FFFFFF !important;
}
.radar-placeholder {
display: flex;
flex-direction: column;
gap: 10px;
color: var(--text-secondary);
font-size: 1rem;
letter-spacing: 0.03em;
}
.radar-placeholder small {
font-size: 0.68rem;
line-height: 1.6;
color: var(--text-tertiary);
}
.core-section {
display: flex;
flex-direction: column;
gap: 20px;
}
.core-metric-grid {
display: flex;
flex-direction: column;
gap: 12px;
}
.core-metric-row {
display: grid;
grid-template-columns: repeat(3, minmax(150px, 1fr));
gap: 12px;
}
.core-metric-card {
background: rgba(245, 246, 247, 0.06);
border: 1px solid rgba(245, 246, 247, 0.12);
border-radius: 16px;
padding: 18px;
display: flex;
flex-direction: column;
gap: 8px;
transition: transform 0.25s ease, border-color 0.25s ease;
}
.core-metric-card:hover {
transform: translateY(-4px);
border-color: rgba(255, 210, 30, 0.45);
}
.core-metric-card .metric-label {
font-size: 0.95rem;
color: var(--text-secondary);
font-weight: 600;
}
.core-metric-card .metric-value {
font-size: 1.8rem;
font-weight: 700;
color: #FFFFFF;
font-family: 'Geist Mono', monospace;
}
.level-strip {
display: grid;
grid-template-columns: repeat(7, 1fr);
gap: 12px;
margin-top: 0;
width: 100%;
}
.level-tile {
background: rgba(245, 246, 247, 0.05);
border: 1px solid rgba(245, 246, 247, 0.1);
border-radius: 12px;
padding: 12px 10px;
display: flex;
flex-direction: column;
gap: 4px;
text-align: center;
min-width: 0;
}
.level-tile-label {
font-size: 0.82rem;
font-weight: 600;
color: var(--text-secondary);
}
.level-tile-score {
font-size: 1.25rem;
font-weight: 700;
color: #FFFFFF;
font-family: 'Geist Mono', monospace;
}
@media (max-width: 980px) {
.performance-card {
padding: 20px;
}
.card-top-row {
flex-direction: column;
}
.rank-panel {
align-self: flex-start;
}
.card-body {
grid-template-columns: 1fr;
}
.radar-slot {
width: 100%;
max-width: 280px;
margin: 0 auto;
}
.radar-chart-container {
overflow-x: auto;
padding: 0 12px 28px;
-webkit-overflow-scrolling: touch;
}
.radar-metric-plot {
min-width: 720px;
}
.level-strip {
grid-template-columns: repeat(4, 1fr);
}
.heatmap-chart-container {
overflow-x: auto;
padding: 0 12px 28px;
-webkit-overflow-scrolling: touch;
}
.heatmap-plot {
min-width: 720px;
}
.level-metric-chart-container {
overflow-x: auto;
padding: 0 12px 28px;
-webkit-overflow-scrolling: touch;
}
.level-metric-plot {
min-width: 720px;
}
}
@media (max-width: 640px) {
.level-strip {
grid-template-columns: repeat(3, 1fr);
}
.heatmap-plot {
min-width: 640px;
}
.level-metric-plot {
min-width: 640px;
}
}
/* Force fonts - highest priority */
.dashboard-section,
.dashboard-section *,
.dashboard-section h2,
.dashboard-section h3,
.dashboard-section p,
.dashboard-section li,
.section-lead,
.section-subtitle,
.phase-card h3,
.phase-list li,
.scenario-body p,
.criteria-card h3,
.criteria-card ul,
.criteria-card li {
font-family: "Nanum Gothic", sans-serif !important;
}
/* Force section-title styling */
.section-title,
h2.section-title,
.dashboard-section .section-title,
.section-header .section-title {
font-family: "Nanum Gothic", sans-serif !important;
}
.domain-title,
h2.domain-title,
.domain-header .domain-title {
font-family: "Nanum Gothic", sans-serif !important;
}
.hero-title,
.hero-subtitle,
h1.hero-title,
p.hero-subtitle {
font-family: "Nanum Gothic", sans-serif !important;
font-size: 2rem !important;
}
/* Force hero-title sizing */
.hero-title,
h1.hero-title {
font-size: 4rem !important;
}
.phase-chart span,
.phase-card .phase-chart span,
.phase-grid .phase-chart span {
font-family: "Nanum Gothic", sans-serif !important;
font-size: 1.2rem !important;
}
.section-lead, .section-subtitle {
font-size: 1.32rem !important;
font-family: "Nanum Gothic", sans-serif !important;
}
.phase-card h3 {
font-size: 1.44rem !important;
font-family: "Nanum Gothic", sans-serif !important;
}
.phase-list li {
font-size: 1.08rem !important;
font-family: "Nanum Gothic", sans-serif !important;
}
</style>
""")
# Wire up the card generator to selection change
card_model_selector.change(
fn=generate_performance_card,
inputs=[card_model_selector],
outputs=[card_display]
)
# Wire up download button with html2canvas capture
download_card_btn.click(
fn=None,
js="""
async () => {
const ensureHtml2Canvas = () => new Promise((resolve, reject) => {
if (window.html2canvas) {
resolve(window.html2canvas);
return;
}
const existing = document.querySelector('script[data-html2canvas]');
if (existing) {
existing.addEventListener('load', () => resolve(window.html2canvas));
existing.addEventListener('error', reject);
return;
}
const script = document.createElement('script');
script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js';
script.async = true;
script.dataset.html2canvas = 'true';
script.onload = () => resolve(window.html2canvas);
script.onerror = () => reject(new Error('Failed to load html2canvas'));
document.head.appendChild(script);
});
const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
await pause(60);
const container = document.getElementById('performance-card-html-en');
const card = container?.querySelector('.performance-card');
if (!container || !card) {
alert('Performance card not found. Please select a model first.');
return;
}
const btn = document.getElementById('download-card-btn-en');
const originalText = btn?.textContent || '';
if (btn) {
btn.textContent = 'Generating...';
btn.disabled = true;
}
try {
const html2canvasLib = await ensureHtml2Canvas();
if (!html2canvasLib) {
throw new Error('html2canvas unavailable');
}
const canvas = await html2canvasLib(card, {
backgroundColor: '#01091A',
scale: 2,
logging: false,
useCORS: true
});
if (!canvas || !canvas.width || !canvas.height) {
throw new Error('Captured canvas is empty');
}
const link = document.createElement('a');
const modelName = card.querySelector('.card-model-name')?.textContent || 'model';
const timestamp = new Date().toISOString().slice(0, 10);
const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`;
link.download = fileName;
const dataUrl = canvas.toDataURL('image/png');
if (!dataUrl || dataUrl === 'data:,' || dataUrl.length <= 'data:image/png;base64,'.length) {
throw new Error('Failed to generate PNG data');
}
link.href = dataUrl;
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
} catch (error) {
console.error('Error capturing card:', error);
alert('Failed to capture performance card. Please try again.');
} finally {
if (btn) {
btn.textContent = originalText;
btn.disabled = false;
}
}
}
"""
)
# Also update card when filters change to keep model selector in sync
def update_dropdown_and_card(level_filter, model_type_filter, sort_order):
    """Re-populate the card model picker from the currently filtered models.

    The first model of the filtered list becomes the new selection (``None``
    when the filter leaves no models). Only the dropdown is returned.
    """
    sort_metric = (
        "Overall Success"
        if level_filter == "ALL"
        else sr_column_map.get(resolve_level(level_filter), "Overall Success")
    )
    filtered_df, _, _ = apply_filters(
        load_leaderboard_data(),
        level_filter,
        model_type_filter,
        sort_order,
        sort_metric,
    )
    choices = filtered_df['Model'].tolist()
    # Select first model from filtered list
    value = choices[0] if choices else None
    return gr.Dropdown(
        choices=choices,
        value=value,
        label="",
        info=None,
        container=False,
    )

# The callback does not depend on the loop variable, so it is defined once
# and registered on every filter control.
for input_component in filter_inputs:
    input_component.change(
        fn=update_dropdown_and_card,
        inputs=filter_inputs,
        outputs=[card_model_selector]
    )
return leaderboard_table
def create_leaderboard_v2_interface():
    """Build the complete leaderboard interface.

    Thin entry point that delegates to ``create_leaderboard_v2_tab`` and
    hands back whatever that builder returns.
    """
    leaderboard = create_leaderboard_v2_tab()
    return leaderboard
def create_domain_radar_chart(df, selected_models=None, max_models=5):
    """Visualize six core capability metrics on a radar chart.

    Args:
        df: DataFrame with a ``Model`` column and any of the six capability
            columns listed in ``metrics_info``. It is copied, not mutated.
        selected_models: Model names to plot, in legend order. When empty or
            ``None``, the first ``max_models`` models are used, ranked by
            "Overall Success" if that column exists.
        max_models: Maximum number of models drawn.

    Returns:
        A Plotly figure with one closed polar trace per model found in ``df``,
        or the empty-state radar figure when ``df`` is empty or has none of
        the capability columns.
    """
    df = df.copy()
    metrics_info = [
        {"column": "Overall Success", "label": "Overall Success", "description": "Average SR across L1-L7"},
        {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
        {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
        {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
        {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
        {"column": "Call Validity", "label": "Call Validity", "description": "Average EPR_CVR across levels"},
    ]
    required_columns = [m["column"] for m in metrics_info]
    if df.empty or not any(col in df.columns for col in required_columns):
        return create_empty_radar_chart("Not enough data to build the capability radar")

    # Default model selection
    if not selected_models:
        if "Overall Success" in df.columns:
            top_models = df.sort_values("Overall Success", ascending=False)
        else:
            top_models = df
        selected_models = top_models['Model'].head(max_models).tolist()
    selected_models = selected_models[:max_models]

    # Ensure metric columns are numeric (unparseable cells become NaN)
    for col in required_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    fig = go.Figure()
    angle_labels = [m["label"] for m in metrics_info]
    tooltips = [m["description"] for m in metrics_info]
    # Repeat the first point at the end so each polygon is drawn closed.
    angles_loop = angle_labels + [angle_labels[0]]
    tooltips_loop = tooltips + [tooltips[0]]

    palette = [
        {'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'},
        {'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'},
        {'fill': 'rgba(161, 98, 7, 0.22)', 'line': '#A16207'},
        {'fill': 'rgba(220, 38, 38, 0.20)', 'line': '#DC2626'},
        {'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'},
    ]

    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model'] == model_name]
        if model_data.empty:
            continue
        row = model_data.iloc[0]

        # Missing columns and NaN cells are plotted as 0.
        values = []
        for col in required_columns:
            value = row.get(col)
            values.append(0.0 if pd.isna(value) else float(value))
        values_loop = values + [values[0]]

        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_loop,
                theta=angles_loop,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(color=colors['line'], width=3),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A')
                ),
                name=model_name,
                customdata=tooltips_loop,
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                              "<span style='color: #94A3B8'>%{theta}</span><br>" +
                              "<span style='color: #F5E7CB; font-size: 12px;'>%{customdata}</span><br>" +
                              "<b style='font-size: 14px; color: #F5F6F7'>%{r:.3f}</b><br>" +
                              "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="white", size=12, family="'Geist', sans-serif")
                )
            )
        )

    tick_vals = [i / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]

    fig.update_layout(
        polar=dict(
            bgcolor='rgba(245, 246, 247, 0.03)',
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='white',
                    family="'Geist Mono', monospace"
                )
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=13,
                    family="'Geist', sans-serif",
                    color='white',
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(size=12, family="'Geist', sans-serif", color='white'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Core Capability Radar</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="white",
                weight=700
            ),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=900,
        margin=dict(t=30, b=50, l=10, r=10),
        autosize=True,
        annotations=[]
    )
    return fig
def create_performance_heatmap(df, ordered_models=None, max_models=12):
    """Render a heatmap of SR scores across task levels for selected models.

    Args:
        df: DataFrame with a ``Model`` column and any of the ``L1_SR`` ..
            ``L7_SR`` columns. It is copied, not mutated.
        ordered_models: Optional model names fixing the column order. Names
            absent from ``df`` are ignored. When omitted, models are ordered
            by "Overall Success" (or by the first available SR column).
        max_models: Maximum number of models shown.

    Returns:
        A Plotly heatmap figure (rows = levels, columns = models), or the
        empty-state heatmap when there is nothing to draw.
    """
    df = df.copy()
    level_sequence = [f"L{i}" for i in range(1, 8)]

    # Collect the (level, column) pairs that exist and coerce them to numbers.
    sr_columns = []
    for level in level_sequence:
        col = f"{level}_SR"
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            sr_columns.append((level, col))

    if df.empty or not sr_columns:
        return create_empty_heatmap("Not enough SR data to render the heatmap")

    df = df.drop_duplicates(subset=["Model"])
    if df.empty:
        return create_empty_heatmap("No models available to render the heatmap")

    # sr_columns holds (level, column) tuples, so the fallback sort key must
    # be the column name, not the tuple itself.
    sort_column = "Overall Success" if "Overall Success" in df.columns else sr_columns[0][1]
    df = df.sort_values(sort_column, ascending=False)

    if ordered_models:
        known_models = set(df["Model"])
        ordered_models = [m for m in ordered_models if m in known_models]
    else:
        ordered_models = df["Model"].tolist()

    if not ordered_models:
        return create_empty_heatmap("No models available to render the heatmap")

    ordered_models = ordered_models[:max_models]
    heatmap_df = df.set_index("Model").reindex(ordered_models)

    level_labels = []
    z_matrix = []
    has_values = False
    for level, col in sr_columns:
        if col not in heatmap_df.columns:
            continue
        level_labels.append(f"{level} · SR")
        row_values = []
        for model in ordered_models:
            value = heatmap_df.at[model, col] if model in heatmap_df.index else None
            if pd.isna(value):
                # None leaves the cell blank instead of colouring it as 0.
                row_values.append(None)
            else:
                row_values.append(float(value))
                has_values = True
        z_matrix.append(row_values)

    if not level_labels or not has_values:
        return create_empty_heatmap("Not enough SR data to render the heatmap")

    colorscale = [
        [0.0, "#0A0A0A"],
        [0.25, "#1A1411"],
        [0.5, "#332818"],
        [0.75, "#B8660A"],
        [1.0, "#FFD21E"],
    ]

    fig = go.Figure()
    fig.add_trace(
        go.Heatmap(
            z=z_matrix,
            x=ordered_models,
            y=level_labels,
            colorscale=colorscale,
            zmin=0,
            zmax=1,
            hovertemplate="<b>%{y}</b><br><span style='color:#FFD21E'>%{x}</span><br>SR · %{z:.3f}<extra></extra>",
            colorbar=dict(
                # ``titlefont`` is deprecated; the font belongs under ``title``.
                title=dict(
                    text="Success Rate",
                    font=dict(color="white", family="'Geist', sans-serif", size=12),
                ),
                tickfont=dict(color="white", family="'Geist', sans-serif", size=10),
                thickness=12,
                len=0.7,
                outlinecolor="rgba(255, 255, 255, 0.1)",
                bgcolor="rgba(1, 9, 26, 0.75)"
            ),
            showscale=True
        )
    )

    # Print each score inside its cell; dark text on the bright end of the
    # colour scale, light text on the dark end.
    annotations = []
    for y_idx, level in enumerate(level_labels):
        for x_idx, model in enumerate(ordered_models):
            value = z_matrix[y_idx][x_idx]
            if value is None:
                continue
            font_color = "#0B1120" if value >= 0.6 else "#F8FAFC"
            annotations.append(
                dict(
                    x=model,
                    y=level,
                    text=f"{value:.3f}",
                    showarrow=False,
                    font=dict(
                        family="'Geist Mono', monospace",
                        size=11,
                        color=font_color
                    )
                )
            )

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        margin=dict(t=80, b=90, l=110, r=160),
        height=520,
        width=1450,
        font=dict(family="'Geist', sans-serif", color="white"),
        xaxis=dict(
            tickangle=-25,
            showgrid=False,
            ticks="",
            tickfont=dict(size=11, family="'Geist', sans-serif", color="white")
        ),
        yaxis=dict(
            showgrid=False,
            ticks="",
            tickfont=dict(size=12, family="'Geist', sans-serif", color="white")
        ),
        annotations=annotations,
        title=dict(
            text="<b>Comprehensive Performance Heatmap</b>",
            x=0.5,
            y=0.98,
            font=dict(
                size=20,
                family="'Geist', sans-serif",
                color="white",
                weight=700
            ),
        )
    )
    fig.update_xaxes(side="bottom")
    return fig
def create_empty_heatmap(message):
    """Render an empty state for the heatmap with a centered message.

    Args:
        message: Text shown in the middle of the figure, prefixed with a map
            emoji.

    Returns:
        A Plotly figure with hidden axes and the heatmap title.
    """
    fig = go.Figure()
    fig.add_annotation(
        text=f"🗺️ {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="white",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=520,
        # No fixed width: let the figure size itself to its container.
        autosize=True,
        margin=dict(t=80, b=80, l=80, r=160),
        title=dict(
            text="<b>Comprehensive Performance Heatmap</b>",
            x=0.5,
            y=0.98,
            font=dict(
                size=20,
                family="'Geist', sans-serif",
                color="white",
                weight=700
            ),
        )
    )
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig
def _prettify_metric_name(metric_key, level_prefix):
    """Turn a raw column name such as ``L1_ArgAcc`` into a label ("Arg Acc")."""
    raw = metric_key[len(level_prefix):]
    text = raw.replace('_', ' ')
    # Split camelCase only at a lowercase/digit -> uppercase boundary so that
    # acronyms ("SR", "PSM", "EPR") stay in one piece instead of "S R".
    text = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', text)
    text = text.replace('Avg', 'Average')

    # Acronyms that title-casing would otherwise render as "Sr", "Psm", ...
    acronyms = {
        'Sr': 'SR',
        'Ac': 'AC',
        'Tsq': 'TSQ',
        'Cvr': 'CVR',
        'Psm': 'PSM',
    }
    words = []
    for word in text.split():
        if word.isupper():
            # Already an acronym in the column name; keep it verbatim.
            words.append(word)
        else:
            titled = word.title()
            words.append(acronyms.get(titled, titled))
    # "CallEM" is a single metric name, not "Call" + "EM".
    return ' '.join(words).replace('Call EM', 'CallEM')


def create_level_metric_chart(df, level, selected_models=None, max_models=5):
    """Render a grouped horizontal bar chart showing per-model scores for a level's metrics.

    Args:
        df: DataFrame with a ``Model`` column and ``<level>_<metric>``
            columns. It is copied, not mutated.
        level: Level id used as the column prefix (e.g. ``"L1"``). A falsy
            value yields the empty-state chart.
        selected_models: Optional model names to plot, in order. Names absent
            from ``df`` are ignored; when none remain, the models in ``df``
            are used instead.
        max_models: Maximum number of models drawn.

    Returns:
        A Plotly figure with one bar trace per model, or the empty-state
        chart when no metric column or model qualifies.
    """
    if not level:
        return create_empty_level_metric_chart("Select a level to view its metrics")

    df = df.copy()
    level_prefix = f"{level}_"
    level_columns = [col for col in df.columns if col.startswith(level_prefix)]

    # Keep only columns that look like 0-1 scores.
    metric_columns = []
    for col in level_columns:
        if "cost" in col[len(level_prefix):].lower():
            continue
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        valid_values = numeric_series.dropna()
        if valid_values.empty:
            continue
        # 1.05 leaves a small tolerance above 1 for near-unit scores.
        if (valid_values < 0).any() or (valid_values > 1.05).any():
            continue
        df[col] = numeric_series
        metric_columns.append(col)

    if not metric_columns:
        return create_empty_level_metric_chart("This level has no 0-1 metrics to visualize")

    df = df.drop_duplicates(subset=['Model'])
    if df.empty:
        return create_empty_level_metric_chart("No models available to render level metrics")

    if selected_models:
        known_models = set(df['Model'])
        model_order = [m for m in selected_models if m in known_models]
    else:
        sort_col = 'Overall Success' if 'Overall Success' in df.columns else metric_columns[0]
        model_order = df.sort_values(sort_col, ascending=False)['Model'].tolist()
    if not model_order:
        model_order = df['Model'].tolist()
    model_order = model_order[:max_models]

    df_models = df[df['Model'].isin(model_order)].set_index('Model')
    if df_models.empty:
        return create_empty_level_metric_chart("No matching models for selected filters")

    # Build display labels, numbering any that collide after prettifying.
    metric_labels = []
    for col in metric_columns:
        label = _prettify_metric_name(col, level_prefix)
        if label in metric_labels:
            suffix = 2
            while f"{label} ({suffix})" in metric_labels:
                suffix += 1
            label = f"{label} ({suffix})"
        metric_labels.append(label)

    model_palette = [
        '#ffd21e',
        '#FF8A3C',
        '#A16207',
        '#DC2626',
        '#F8FAFC',
        '#38BDF8',
    ]

    fig = go.Figure()
    max_value = 0
    for idx, model in enumerate(model_order):
        values = []
        for col in metric_columns:
            value = df_models.at[model, col] if (model in df_models.index and col in df_models.columns) else float('nan')
            if pd.notna(value):
                values.append(float(value))
                max_value = max(max_value, float(value))
            else:
                # None leaves a gap instead of drawing a zero-length bar.
                values.append(None)
        color = model_palette[idx % len(model_palette)]
        fig.add_trace(
            go.Bar(
                name=model,
                y=metric_labels,
                x=values,
                orientation='h',
                marker=dict(color=color, line=dict(color='rgba(1,9,26,0.8)', width=1)),
                hovertemplate="<b>%{y}</b><br>Model · <span style='color:#FFD21E'>%{fullData.name}</span><br>Score · %{x:.3f}<extra></extra>",
            )
        )

    # Grow the figure with the number of metric rows.
    plot_height = max(360, 140 + 48 * len(metric_labels))
    if max_value <= 0:
        x_range = [0, 1]
    else:
        # 5% headroom to the right of the longest bar.
        x_range = [0, max_value * 1.05]

    fig.update_layout(
        barmode='group',
        bargap=0.25,
        bargroupgap=0.18,
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=plot_height,
        # No fixed width: let the figure size itself to its container.
        autosize=True,
        margin=dict(t=90, b=80, l=220, r=160),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            bgcolor='rgba(1, 9, 26, 0.75)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            font=dict(size=11, family="'Geist', sans-serif", color='white')
        ),
        xaxis=dict(
            title=dict(text=f"<b>{level} Metric Score</b>", font=dict(size=14, color="white")),
            tickfont=dict(size=11, color="white"),
            gridcolor='rgba(245, 246, 247, 0.08)',
            zerolinecolor='rgba(245, 246, 247, 0.18)',
            range=x_range
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="white"),
            automargin=True
        ),
        title=dict(
            text=f"<b>{level} Metric Breakdown</b>",
            x=0.5,
            y=0.98,
            font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
        )
    )
    return fig
def create_empty_level_metric_chart(message):
    """Build the placeholder figure shown when no level metrics can be plotted.

    The figure has hidden axes and a single bordered annotation, centred on
    the canvas, containing ``message`` prefixed with a compass emoji.

    Args:
        message: Explanation to display in place of the chart.

    Returns:
        plotly.graph_objects.Figure: The placeholder figure.
    """
    fig = go.Figure()
    fig.add_annotation(
        # U+1F9ED (compass). Written as an escape so the emoji survives any
        # re-encoding of this source file instead of degrading to mojibake.
        text=f"\U0001F9ED {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(size=18, color="white", family="'Geist', sans-serif"),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=420,
        width=1450,
        margin=dict(t=80, b=60, l=80, r=120),
        title=dict(
            text="<b>Level Metric Breakdown</b>",
            x=0.5,
            y=0.98,
            font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
        )
    )
    # Nothing is plotted, so the axes would only add visual noise.
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig
def create_empty_radar_chart(message):
    """Create an empty radar chart with a message.

    Renders ``message`` (prefixed with a bar-chart emoji) in a bordered box
    centred on a blank dark canvas titled "Core Capability Radar".

    Args:
        message: Explanation to display in place of the radar chart.

    Returns:
        plotly.graph_objects.Figure: The placeholder figure.
    """
    fig = go.Figure()
    fig.add_annotation(
        # U+1F4CA (bar chart). Written as an escape so the emoji survives any
        # re-encoding of this source file instead of degrading to mojibake.
        text=f"\U0001F4CA {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="white",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    # NOTE: no ``annotations=`` entry is passed to update_layout. A text-less
    # entry there is applied on top of the message annotation added above,
    # dragging it to the bottom-right corner in a 10px slate font (or replacing
    # it outright), so the message would no longer be shown as intended.
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1450,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            text="<b>Core Capability Radar</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="white",
                weight=700
            ),
        )
    )
    return fig
# NEW VISUALIZATION FUNCTIONS
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create scatter plot showing cost vs performance efficiency.

    One marker is drawn per model: x is ``metric``, y is ``Avg Total Cost``
    and the marker size is proportional to ``Avg Turns``. Models are grouped
    into one trace per ``Model Type`` so each type gets its own colour and
    legend entry. Dashed lines mark the median of each axis.

    Args:
        df: DataFrame with ``Model``, ``Model Type``, ``Avg Total Cost``,
            ``Avg Turns`` and ``metric`` columns. Values may be numbers or
            strings; empty strings and non-numeric values are treated as
            missing.
        metric: Name of the performance column plotted on the x axis.

    Returns:
        plotly.graph_objects.Figure: The scatter plot, or the result of
        ``create_empty_chart`` when no model has both a cost and a score.
    """
    cost_col = 'Avg Total Cost'
    turns_col = 'Avg Turns'
    no_data_message = "No data available for cost-performance analysis"

    # Filter out models without cost or performance data
    df_filtered = df[(df[cost_col] != '') & (df[metric] != '')].copy()
    if df_filtered.empty:
        return create_empty_chart(no_data_message)

    # Convert to numeric; anything unparsable becomes NaN
    for column in (cost_col, metric, turns_col):
        df_filtered[column] = pd.to_numeric(df_filtered[column], errors='coerce')

    # Rows that turned into NaN cannot be positioned on the plot; drop them so
    # an all-missing input falls back to the empty chart instead of drawing
    # median lines at NaN.
    df_filtered = df_filtered.dropna(subset=[cost_col, metric])
    if df_filtered.empty:
        return create_empty_chart(no_data_message)

    # Legend labels and colours per model type
    label_map = {
        'Proprietary': 'API',
        'Open source': 'OSS'
    }
    color_map = {
        'Proprietary': '#1098F7',  # Airglow Blue for Proprietary
        'Open source': '#58BC82'   # Green for Open source
    }

    # The real turn count travels in customdata: marker.size holds the value
    # scaled for display, so it must not be echoed back as "Turns".
    hover_template = (
        "<b>%{text}</b><br>"
        f"{metric}: %{{x:.3f}}<br>"
        "Cost: $%{y:.3f}<br>"
        "Turns: %{customdata:.1f}<br>"
        "<extra></extra>"
    )

    fig = go.Figure()

    # Add scatter points, one trace per model type
    for model_type in df_filtered['Model Type'].unique():
        df_type = df_filtered[df_filtered['Model Type'] == model_type]
        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type[cost_col],
            mode='markers+text',
            name=label_map.get(model_type, model_type),
            text=df_type['Model'],
            customdata=df_type[turns_col],
            textposition="top center",
            textfont=dict(size=10, color='white'),
            marker=dict(
                size=df_type[turns_col] * 3,  # Size based on number of turns
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A')
            ),
            hovertemplate=hover_template
        ))

    # Add quadrant lines at the medians
    median_x = df_filtered[metric].median()
    median_y = df_filtered[cost_col].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Add quadrant labels. Emoji are written as escapes (U+1F48E gem stone,
    # U+26A0 U+FE0F warning sign) so they survive re-encoding of this file.
    fig.add_annotation(x=0.95, y=0.05, text="\U0001F48E High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="white"), bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="\u26A0\uFE0F Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)")

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Cost ($)</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='white'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )
    return fig
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create scatter plot showing speed vs accuracy trade-off.

    One marker is drawn per model: x is ``metric``, y is
    ``Avg Session Duration`` and the marker colour encodes ``Avg Total Cost``
    on a continuous scale. Dashed lines mark the median of each axis.

    Args:
        df: DataFrame with ``Model``, ``Avg Session Duration``,
            ``Avg Total Cost`` and ``metric`` columns. Values may be numbers
            or strings; empty strings and non-numeric values are treated as
            missing.
        metric: Name of the performance column plotted on the x axis.

    Returns:
        plotly.graph_objects.Figure: The scatter plot, or the result of
        ``create_empty_chart`` when no model has both a duration and a score.
    """
    duration_col = 'Avg Session Duration'
    cost_col = 'Avg Total Cost'
    no_data_message = "No data available for speed-accuracy analysis"

    # Filter out models without duration or performance data
    df_filtered = df[(df[duration_col] != '') & (df[metric] != '')].copy()
    if df_filtered.empty:
        return create_empty_chart(no_data_message)

    # Convert to numeric; anything unparsable becomes NaN. Cost only drives
    # the colour scale, so a missing cost does not exclude a model.
    for column in (duration_col, metric, cost_col):
        df_filtered[column] = pd.to_numeric(df_filtered[column], errors='coerce')

    # Rows that turned into NaN cannot be positioned on the plot; drop them so
    # an all-missing input falls back to the empty chart instead of drawing
    # median lines at NaN.
    df_filtered = df_filtered.dropna(subset=[duration_col, metric])
    if df_filtered.empty:
        return create_empty_chart(no_data_message)

    fig = go.Figure()

    # Add scatter trace
    fig.add_trace(go.Scatter(
        x=df_filtered[metric],
        y=df_filtered[duration_col],
        mode='markers+text',
        text=df_filtered['Model'],
        textposition="top center",
        textfont=dict(size=9, color='white'),
        marker=dict(
            size=12,
            color=df_filtered[cost_col],
            colorscale=[[0, '#0A0A0A'], [0.5, '#B8660A'], [1, '#ffd21e']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Cost ($)",
                    font=dict(color="white")
                ),
                tickfont=dict(color="white"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
                x=1.02
            ),
            line=dict(width=2, color='#01091A')
        ),
        hovertemplate="<b>%{text}</b><br>" +
                      f"{metric}: %{{x:.3f}}<br>" +
                      "Duration: %{y:.1f}s<br>" +
                      "Cost: $%{marker.color:.3f}<br>" +
                      "<extra></extra>"
    ))

    # Add quadrant lines at the medians
    median_x = df_filtered[metric].median()
    median_y = df_filtered[duration_col].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Add quadrant labels. Emoji are written as escapes (U+26A1 high voltage,
    # U+1F40C snail) so they survive re-encoding of this file.
    fig.add_annotation(x=0.95, y=0.05, text="\u26A1 Fast & Accurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="white", weight=600))
    fig.add_annotation(x=0.05, y=0.95, text="\U0001F40C Slow & Inaccurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#ffd21e", weight=600))

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Duration (seconds)</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=120)
    )
    return fig
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Render a model-by-domain bubble chart of per-domain scores.

    For every model row with a numeric ``Avg {metric_type}``, each available
    ``{domain} {metric_type}`` score becomes one bubble: its size follows the
    score itself and its colour follows the score's deviation from the
    model's own average. Returns the result of ``create_empty_chart`` when no
    bubble can be built.
    """
    domain_names = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    average_col = f'Avg {metric_type}'

    # Collect one record per (model, domain) pair that has a usable score.
    records = []
    for _, row in df.iterrows():
        if row['Model'] == '':
            continue
        baseline = pd.to_numeric(row[average_col], errors='coerce')
        if pd.isna(baseline):
            continue
        for name in domain_names:
            column = f'{name} {metric_type}'
            if column not in row or row[column] == '':
                continue
            score = pd.to_numeric(row[column], errors='coerce')
            if pd.isna(score):
                continue
            records.append({
                'Model': row['Model'],
                'Domain': name,
                'Performance': score,
                # Deviation of this domain's score from the model's average
                'Specialization': score - baseline,
                'Model Type': row['Model Type'],
            })

    if not records:
        return create_empty_chart("No domain specialization data available")

    bubbles = pd.DataFrame(records)

    hover_lines = [
        f"Performance: {perf:.3f}<br>Specialization: {spec:+.3f}"
        for perf, spec in zip(bubbles['Performance'], bubbles['Specialization'])
    ]

    colorbar_style = dict(
        title=dict(text="Specialization<br>Strength", font=dict(color="white")),
        tickfont=dict(color="white"),
        bgcolor="rgba(1, 9, 26, 0.8)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
    )
    bubble_style = dict(
        size=bubbles['Performance'] * 30,   # bubble area tracks the raw score
        color=bubbles['Specialization'],    # colour tracks deviation from average
        colorscale=[[0, '#B8660A'], [0.5, '#E6B800'], [1, '#ffd21e']],
        showscale=True,
        colorbar=colorbar_style,
        line=dict(width=2, color='#01091A'),
        opacity=0.8,
    )

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=bubbles['Domain'],
        y=bubbles['Model'],
        mode='markers',
        marker=bubble_style,
        text=hover_lines,
        hovertemplate="<b>%{y}</b><br>" "Domain: %{x}<br>" "%{text}<br>" "<extra></extra>",
    ))

    if metric_type == "AC":
        metric_display = "Action Completion"
    else:
        metric_display = "Tool Selection Quality"

    grid_color = "rgba(245, 246, 247, 0.1)"
    axis_title_font = dict(size=16, color="white")
    fig.update_layout(
        title=dict(
            text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700),
        ),
        xaxis=dict(
            title=dict(text="<b>Business Domains</b>", font=axis_title_font),
            tickfont=dict(size=13, color="white"),
            gridcolor=grid_color,
        ),
        yaxis=dict(
            title=dict(text="<b>Models</b>", font=axis_title_font),
            tickfont=dict(size=11, color="white"),
            gridcolor=grid_color,
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1100,
        width=1450,
        margin=dict(t=100, b=80, l=220, r=120),
    )
    return fig
def create_performance_gap_analysis(df, metric_type="AC"):
    """Create range plot showing performance gaps by domain.

    For each domain column ``{domain} {metric_type}`` present in ``df``, the
    numeric values across all rows are summarised as min, Q1, median, Q3 and
    max. Each domain is drawn as a horizontal range line with an IQR overlay,
    a diamond at the median and dots at the extremes. Domains are ordered by
    their max-min gap, smallest first.

    Args:
        df: DataFrame whose domain columns hold per-model scores. Values that
            cannot be parsed as numbers are ignored.
        metric_type: Column suffix selecting the metric (e.g. ``"AC"``).

    Returns:
        plotly.graph_objects.Figure: The range plot, or the result of
        ``create_empty_chart`` when no domain has any numeric value.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    # Calculate min, max, median and quartiles for each domain
    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col not in df.columns:
            continue
        domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
        if domain_values.empty:
            continue
        lowest = domain_values.min()
        highest = domain_values.max()
        gap_data.append({
            'Domain': domain,
            'Min': lowest,
            'Max': highest,
            'Median': domain_values.median(),
            'Q1': domain_values.quantile(0.25),
            'Q3': domain_values.quantile(0.75),
            'Gap': highest - lowest
        })

    if not gap_data:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(gap_data).sort_values('Gap', ascending=True)
    # Materialise once: the rows are walked twice below.
    rows = list(df_gap.itertuples(index=False))

    fig = go.Figure()

    # Add range bars
    for row in rows:
        domain_pair = [row.Domain, row.Domain]
        # Full range line
        fig.add_trace(go.Scatter(
            x=[row.Min, row.Max],
            y=domain_pair,
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip'
        ))
        # IQR overlay: a closed path along the domain's row, from Q1 to Q3
        fig.add_trace(go.Scatter(
            x=[row.Q1, row.Q3, row.Q3, row.Q1, row.Q1],
            y=[row.Domain] * 5,
            fill='toself',
            fillcolor='rgba(255, 210, 30, 0.3)',
            line=dict(color='#ffd21e', width=2),
            showlegend=False,
            hoverinfo='skip',
            mode='lines'
        ))
        # Median marker; it carries the hover summary for the whole domain
        fig.add_trace(go.Scatter(
            x=[row.Median],
            y=[row.Domain],
            mode='markers',
            marker=dict(
                size=12,
                color='#ffd21e',
                symbol='diamond',
                line=dict(width=2, color='#01091A')
            ),
            showlegend=False,
            hovertemplate=f"<b>{row.Domain}</b><br>" +
                          f"Min: {row.Min:.3f}<br>" +
                          f"Q1: {row.Q1:.3f}<br>" +
                          f"Median: {row.Median:.3f}<br>" +
                          f"Q3: {row.Q3:.3f}<br>" +
                          f"Max: {row.Max:.3f}<br>" +
                          f"Gap: {row.Gap:.3f}<br>" +
                          "<extra></extra>"
        ))

    # Add min/max points in a second pass so they are drawn on top of every
    # range bar added above.
    for row in rows:
        fig.add_trace(go.Scatter(
            x=[row.Min, row.Max],
            y=[row.Domain, row.Domain],
            mode='markers',
            marker=dict(size=8, color='white', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip'
        ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display} Score</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
        ),
        yaxis=dict(
            title=dict(
                text="<b>Business Domain</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=13, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1450,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False
    )

    # Add legend manually. Symbols are written as escapes (U+25C6 black
    # diamond, U+2501 heavy horizontal, U+2500 light horizontal) so they
    # survive re-encoding of this file.
    fig.add_annotation(
        text="\u25C6 Median \u2501 IQR \u2500 Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='white'),
        showarrow=False
    )
    return fig
def create_empty_chart(message):
"""Create an empty chart with a message"""
fig = go.Figure()
fig.add_annotation(
text=f"πŸ“Š {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(
size=18,
color="white",
family="'Geist', sans-serif"
),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=700,
width=1450,
margin=dict(t=80, b=80, l=80, r=80)
)