"""
Agent Leaderboard v1 - Main leaderboard interface
Updated implementation with LLM Type support and optimized radar charts
"""
import base64
import math
import re
from datetime import datetime
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
# Import components and styles from modular files
from components.leaderboard_components import (
get_chart_colors, get_rank_badge, get_type_badge,
get_metric_tooltip, get_responsive_styles, get_faq_section
)
from styles.leaderboard_styles import get_leaderboard_css
ASSET_ICON_PATH = Path("krew_icon.png")
KREW_ICON_BASE64 = ""
if ASSET_ICON_PATH.exists():
KREW_ICON_BASE64 = base64.b64encode(ASSET_ICON_PATH.read_bytes()).decode("utf-8")
CSV_PATH = Path("combined_evaluation_summary.csv")
if CSV_PATH.exists():
EVALUATION_DATE = datetime.fromtimestamp(CSV_PATH.stat().st_mtime).strftime("%Y-%m-%d")
else:
EVALUATION_DATE = datetime.today().strftime("%Y-%m-%d")
def create_leaderboard_v2_tab():
"""Create the main leaderboard v1 tab with interactive table"""
token_to_cost_factor = 2e-6 # Rough cost per token ($2 per 1M tokens)
tokens_per_turn = 1000 # Approximate tokens exchanged per turn for scaling
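    # Worked example of the assumed scaling: a level averaging 12,000 tokens yields
    # an estimated cost of 12_000 * 2e-6 = $0.024 and 12_000 / 1_000 = 12 turns.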
level_ids = [f"L{i}" for i in range(1, 8)]
level_tsq_sources = {
"L1": "L1_ArgAcc",
"L2": "L2_SelectAcc",
"L3": "L3_PSM",
"L4": "L4_Coverage",
"L5": "L5_AdaptiveRoutingScore",
"L6": "L6_EffScore",
"L7": "L7_ContextRetention",
}
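    # Each level maps to one representative quality metric; load_leaderboard_data
    # averages whichever of these columns exist in the CSV into the 'Avg TSQ' column.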
def load_leaderboard_data():
"""Load and prepare the leaderboard data"""
        df = pd.read_csv(CSV_PATH)
# Clean and prepare data
df = df.copy()
        numeric_candidate_cols = [col for col in df.columns if col not in ('Model', 'Vendor', 'LLM Type')]
for col in numeric_candidate_cols:
df[col] = pd.to_numeric(df[col], errors='coerce')
# Derive per-level helper columns for cost and turns
sr_columns = []
tsq_columns = []
duration_columns = []
cost_columns = []
turns_columns = []
for level in level_ids:
sr_col = f"{level}_SR"
if sr_col in df.columns:
sr_columns.append(sr_col)
df[sr_col] = df[sr_col].round(3)
tsq_source = level_tsq_sources.get(level)
if tsq_source and tsq_source in df.columns:
tsq_columns.append(tsq_source)
duration_col = f"{level}_Avg_Exec_Time"
if duration_col in df.columns:
duration_columns.append(duration_col)
token_col = f"{level}_Avg_Tokens"
if token_col in df.columns:
cost_col = f"{level}_Avg_Cost"
turns_col = f"{level}_Avg_Turns"
df[cost_col] = df[token_col] * token_to_cost_factor
df[turns_col] = df[token_col] / tokens_per_turn
cost_columns.append(cost_col)
turns_columns.append(turns_col)
if sr_columns:
df['Avg AC'] = df[sr_columns].mean(axis=1)
if tsq_columns:
df['Avg TSQ'] = df[tsq_columns].mean(axis=1)
if cost_columns:
df['Avg Total Cost'] = df[cost_columns].mean(axis=1)
if duration_columns:
df['Avg Session Duration'] = df[duration_columns].mean(axis=1)
if turns_columns:
df['Avg Turns'] = df[turns_columns].mean(axis=1)
# Derive core capability metrics for radar visualization
if sr_columns:
df['Overall Success'] = df[sr_columns].mean(axis=1)
execution_cols = [col for col in ['L1_CallEM', 'L1_ArgAcc', 'L2_SelectAcc'] if col in df.columns]
if execution_cols:
df['Execution Accuracy'] = df[execution_cols].mean(axis=1)
reasoning_cols = [col for col in ['L3_ProvAcc', 'L3_PSM', 'L4_Coverage'] if col in df.columns]
if reasoning_cols:
df['Complex Reasoning'] = df[reasoning_cols].mean(axis=1)
robustness_cols = [col for col in ['L5_AdaptiveRoutingScore', 'L5_FallbackSR'] if col in df.columns]
if robustness_cols:
df['Robustness'] = df[robustness_cols].mean(axis=1)
        context_cols = [col for col in ['L6_ReuseRate', 'L6_EffScore', 'L7_ContextRetention'] if col in df.columns]
if context_cols:
df['Context & Efficiency'] = df[context_cols].mean(axis=1)
epr_cols = [f"L{i}_EPR_CVR" for i in range(1, 8) if f"L{i}_EPR_CVR" in df.columns]
if epr_cols:
df['Call Validity'] = df[epr_cols].mean(axis=1)
# Use LLM Type from CSV directly, with mapping to display names
if 'LLM Type' in df.columns:
# Clean the LLM Type column to remove any whitespace
df['LLM Type'] = df['LLM Type'].astype(str).str.strip()
# Map LLM Type to Model Type
def map_llm_type(llm_type):
if llm_type.upper() == "OSS":
return "Open source"
else:
return "Proprietary"
df['Model Type'] = df['LLM Type'].apply(map_llm_type)
else:
# Fallback to vendor mapping if LLM Type column doesn't exist
vendor_model_type_map = {
"OpenAI": "Proprietary",
"Anthropic": "Proprietary",
"Google": "Proprietary",
"Microsoft": "Proprietary",
"Mistral": "Proprietary",
"Databricks": "Open source",
"Meta": "Open source",
"Alibaba": "Open source",
"알리바바": "Open source", # Korean name for Alibaba
"Kakao": "Open source",
"SKT": "Open source",
"KT": "Open source",
"xAI": "Proprietary",
}
df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary')
# Round numeric columns for better display
round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy',
'Complex Reasoning', 'Robustness', 'Context & Efficiency', 'Call Validity']
round_one_cols = ['Avg Session Duration', 'Avg Turns']
for col in round_three_cols:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce').round(3)
for col in round_one_cols:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce').round(1)
if cost_columns:
df[cost_columns] = df[cost_columns].apply(pd.to_numeric, errors='coerce').round(3)
if turns_columns:
df[turns_columns] = df[turns_columns].apply(pd.to_numeric, errors='coerce').round(2)
if duration_columns:
df[duration_columns] = df[duration_columns].apply(pd.to_numeric, errors='coerce').round(2)
# Fill NaN values appropriately
df = df.fillna('')
return df
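    # Minimal usage sketch (derived columns appear only if their source columns are present):
    #   df = load_leaderboard_data()
    #   df[['Model', 'Vendor', 'Avg AC', 'Avg TSQ', 'Avg Total Cost']].head()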
def build_static_radar_chart(values, labels):
"""Render a small static radar chart as inline SVG"""
        if not values or all(v == 0 for v in values):
            # Placeholder shown when no metric values are available
            return """
            <div class="mini-radar-empty">
                <p>Radar chart</p>
                <p>Execution Accuracy · Complex Reasoning · Robustness · Context & Efficiency · Overall Success · Call Validity</p>
            </div>
            """
size = 220
center = size / 2
radius = size * 0.38
n = len(values)
def point(v, idx, scale=1.0):
angle = (2 * math.pi * idx / n) - math.pi / 2
r = radius * v * scale
x = center + r * math.cos(angle)
y = center + r * math.sin(angle)
return x, y
polygon_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(v, i) for i, v in enumerate(values)))
ring_polygons = []
for step in (0.33, 0.66, 1.0):
ring_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(step, i) for i in range(n)))
opacity = 0.04 if step < 1.0 else 0.08
ring_polygons.append(f'')
axis_lines = "\n".join(
f''
for idx in range(n)
)
label_spans = "\n".join(
f'{label}'
for idx, label in enumerate(labels)
)
svg = f"""
"""
return svg
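    # Example call (values are illustrative only; labels mirror the card metrics):
    #   build_static_radar_chart(
    #       [0.82, 0.74, 0.66, 0.71, 0.78, 0.90],
    #       ["Execution Accuracy", "Complex Reasoning", "Robustness",
    #        "Context & Efficiency", "Overall Success", "Call Validity"],
    #   )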
# Level metadata for the 7-stage task framework
    level_details = {
        "ALL": {
            "title": "ALL · All Tasks",
            "description": "Shows average performance across all seven tasks at a glance and serves as the baseline for comparing individual levels."
        },
        "L1": {
            "title": "L1 · Single Tool Execution",
            "description": "Evaluates single-tool execution and the accuracy of basic instruction following."
        },
        "L2": {
            "title": "L2 · Tool Selection",
            "description": "Measures the ability to pick the tool that fits the request and call it with the right parameters."
        },
        "L3": {
            "title": "L3 · Sequential Reasoning (Chaining)",
            "description": "Verifies problem solving through multi-step, sequential reasoning."
        },
        "L4": {
            "title": "L4 · Parallel Reasoning (Aggregation)",
            "description": "Evaluates aggregating and summarizing information from multiple sources in parallel."
        },
        "L5": {
            "title": "L5 · Robustness (Fallback)",
            "description": "Checks how the model recognizes and recovers from unexpected errors and failures."
        },
        "L6": {
            "title": "L6 · Efficiency",
            "description": "Examines operational efficiency: reaching the goal with minimal calls and cost."
        },
        "L7": {
            "title": "L7 · Long-Horizon Contextual Memory",
            "description": "Analyzes retaining and appropriately using long-running conversational context."
        }
    }
default_level = "ALL"
sr_column_map = {level: f"{level}_SR" for level in level_ids}
overall_sort_column = "Overall Success"
def resolve_level(level_value):
"""Normalize the incoming level filter value"""
if not level_value:
return default_level
return level_value if level_value in level_details else default_level
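    # e.g. resolve_level("L3") -> "L3"; resolve_level(None) and resolve_level("L9")
    # both fall back to "ALL".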
def generate_html_table(filtered_df, highlight_column):
"""Generate styled HTML table with per-level success rates"""
valid_highlights = list(sr_column_map.values()) + ["Overall Success"]
highlight_column = highlight_column if highlight_column in valid_highlights else None
overall_column = "Overall Success"
overall_highlight = (highlight_column == overall_column)
highlight_map = {level: (sr_column_map[level] == highlight_column) for level in level_ids}
table_html = """
Rank
Model
Vendor
LLM Type
"""
overall_header_classes = ["numeric-cell"]
if overall_highlight:
overall_header_classes.append("highlight-header")
table_html += f"""
Overall ⓘ
"""
for level in level_ids:
header_classes = ["numeric-cell"]
if highlight_map.get(level):
header_classes.append("highlight-header")
table_html += f"""
{level} ⓘ
"""
table_html += """
"""
def safe_float(value):
if value is None:
return ''
if isinstance(value, str) and value.strip() == '':
return ''
if pd.isna(value):
return ''
try:
return float(value)
except (TypeError, ValueError):
return ''
        # Generate table rows
        for idx, (_, row) in enumerate(filtered_df.iterrows()):
            rank = idx + 1
            table_html += f"""
                <tr>
                    <td>{get_rank_badge(rank)}</td>
                    <td class="model-cell">{row['Model']}</td>
                    <td>{row['Vendor']}</td>
                    <td>{get_type_badge(row['Model Type'])}</td>
            """
            overall_value = safe_float(row.get(overall_column, ''))
            if overall_value != '':
                overall_display = f'{overall_value:.3f}'
            else:
                overall_display = '-'
            overall_classes = ["numeric-cell"]
            if overall_highlight:
                overall_classes.append("highlight-cell")
            table_html += f'<td class="{" ".join(overall_classes)}">{overall_display}</td>'
            for level in level_ids:
                sr_col = sr_column_map[level]
                value = safe_float(row.get(sr_col, ''))
                if value != '':
                    value_display = f'{value:.3f}'
                else:
                    value_display = '-'
                cell_classes = ["numeric-cell"]
                if highlight_map.get(level):
                    cell_classes.append("highlight-cell")
                table_html += f'<td class="{" ".join(cell_classes)}">{value_display}</td>'
            table_html += "</tr>"
        table_html += """
            </tbody>
        </table>
        """
        return table_html
def update_leaderboard_title(level_filter):
"""Update the leaderboard title based on selected level"""
level_key = resolve_level(level_filter)
level_info = level_details.get(level_key, level_details[default_level])
level_title = level_info["title"]
level_description = level_info["description"]
return f"""
Agent Leaderboard · {level_title}
{level_description}
"""
model_type_lookup = {
"OSS": "Open source",
"API": "Proprietary"
}
def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"):
"""Apply shared filters and sorting to the leaderboard dataframe."""
filtered_df = df.copy()
level_key = resolve_level(level_filter)
highlight_column = None
if model_type_filter != "All":
mapped_type = model_type_lookup.get(model_type_filter, model_type_filter)
filtered_df = filtered_df[filtered_df['Model Type'] == mapped_type]
actual_sort_column = sort_by if sort_by in filtered_df.columns else None
if not actual_sort_column:
if level_key == "ALL":
actual_sort_column = overall_sort_column if overall_sort_column in filtered_df.columns else None
else:
actual_sort_column = sr_column_map.get(level_key)
if level_key in sr_column_map:
highlight_column = sr_column_map[level_key]
elif level_key == "ALL" and overall_sort_column in filtered_df.columns:
highlight_column = overall_sort_column
if actual_sort_column and actual_sort_column in filtered_df.columns:
ascending = (sort_order == "Ascending")
filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last')
return filtered_df, level_key, highlight_column
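    # Example: apply_filters(df, "L3", "OSS", "Descending") keeps open-source models,
    # sorts by Overall Success (the default sort_by), and highlights the L3_SR column.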
def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order):
"""Filter and sort the leaderboard data"""
df = load_leaderboard_data()
filtered_df, level_key, highlight_column = apply_filters(df, level_filter, model_type_filter, sort_order, sort_by)
# Generate HTML table
return generate_html_table(filtered_df, highlight_column)
# Load initial data
initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending")
initial_df = load_leaderboard_data() # Load raw data for model selector
initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else []
initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else []
initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models)
initial_level_metric_level = level_ids[0] if level_ids else None
initial_level_model_choices = initial_df['Model'].tolist() if len(initial_df) > 0 else []
initial_level_model_values = initial_level_model_choices[:5]
initial_level_metric_chart = create_level_metric_chart(
initial_df,
initial_level_metric_level,
initial_level_model_values
) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available")
# Load custom CSS and responsive styles
custom_css = get_leaderboard_css() + get_responsive_styles() + """
"""
gr.HTML(custom_css)
# Header styles and navigation
gr.HTML("""
""")
gr.Image(
value="banner.png",
show_label=False,
interactive=False,
type="filepath",
elem_id="hero-banner"
)
gr.HTML("""
""")
# Update functions
def get_optimal_sort_order(sort_by_value):
"""Return the optimal sort order for a given metric"""
# Metrics where higher is better (descending)
descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
# Metrics where lower is better (ascending)
ascending_metrics = []
if sort_by_value in descending_metrics:
return "Descending"
elif sort_by_value in ascending_metrics:
return "Ascending"
else:
return "Descending" # Default fallback
def update_table(level_filter, model_type_filter, sort_order):
title_html = update_leaderboard_title(level_filter)
sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
return title_html, table_html
def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
# Get filtered dataframe
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
# Update model selector choices based on filtered data
available_models_all = filtered_df['Model'].tolist()
available_models = available_models_all[:15] # Top 15 from filtered results
# If selected models are not in available models, reset to top 5
if selected_models:
valid_selected = [m for m in selected_models if m in available_models]
if not valid_selected:
valid_selected = available_models[:5]
else:
valid_selected = available_models[:5]
# Create radar chart
chart = create_domain_radar_chart(filtered_df, valid_selected)
# Prepare heatmap order prioritizing selected models
heatmap_order = []
for model in valid_selected:
if model not in heatmap_order:
heatmap_order.append(model)
for model in available_models_all:
if model not in heatmap_order:
heatmap_order.append(model)
heatmap_order = heatmap_order[:12]
heatmap_fig = create_performance_heatmap(filtered_df, heatmap_order)
# Level metric chart
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
available_level_models = available_models_all
if level_selected_models:
valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
if not valid_level_models:
valid_level_models = available_level_models[:5]
else:
valid_level_models = available_level_models[:5]
level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
gr.Dropdown(
choices=available_models,
value=valid_selected,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown"]
),
chart,
heatmap_fig,
gr.Dropdown(
choices=available_level_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_metric_fig,
)
def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
# Get filtered dataframe
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
available_models_all = filtered_df['Model'].tolist()
if selected_models:
valid_selected = [m for m in selected_models if m in available_models_all]
if not valid_selected:
valid_selected = available_models_all[:5]
else:
valid_selected = available_models_all[:5]
heatmap_order = []
for model in valid_selected:
if model not in heatmap_order:
heatmap_order.append(model)
for model in available_models_all:
if model not in heatmap_order:
heatmap_order.append(model)
heatmap_order = heatmap_order[:12]
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
available_level_models = available_models_all
if level_selected_models:
valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
if not valid_level_models:
valid_level_models = available_level_models[:5]
else:
valid_level_models = available_level_models[:5]
level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
create_domain_radar_chart(filtered_df, valid_selected),
create_performance_heatmap(filtered_df, heatmap_order),
gr.Dropdown(
choices=available_level_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_metric_fig,
)
def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
available_models = filtered_df['Model'].tolist()
if level_selected_models:
valid_level_models = [m for m in level_selected_models if m in available_models][:5]
if not valid_level_models:
valid_level_models = available_models[:5]
else:
valid_level_models = available_models[:5]
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
gr.Dropdown(
choices=available_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_chart,
)
# Update table when filters change
filter_inputs = [domain_filter, model_type_filter, sort_order]
for input_component in filter_inputs:
input_component.change(
fn=update_table,
inputs=filter_inputs,
outputs=[leaderboard_title, leaderboard_table]
)
# Also update radar chart when filters change
input_component.change(
fn=update_radar_chart,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[model_selector, radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
)
# Update radar chart when model selection changes
model_selector.change(
fn=update_radar_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
)
level_metric_selector.change(
fn=update_level_metric_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[level_model_selector, level_metric_chart]
)
level_model_selector.change(
fn=update_level_metric_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[level_model_selector, level_metric_chart]
)
# Define generate_performance_card function before using it
def generate_performance_card(model_name):
"""Generate HTML for the model performance card"""
if not model_name:
return """
Please select a model to generate its performance card
"""
# Get model data
df = load_leaderboard_data()
model_data = df[df['Model'] == model_name]
if model_data.empty:
return """
Model not found in the database
"""
row = model_data.iloc[0]
# Get overall rank based on overall success
df_with_success = df.copy()
        df_with_success['Overall Success'] = pd.to_numeric(
            df_with_success.get('Overall Success', pd.Series(dtype=float)), errors='coerce'
        )
        df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
        df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
        try:
            rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
        except (IndexError, KeyError):
            rank = 'N/A'
# Format values
def format_value(val, decimals=3, prefix='', suffix=''):
if pd.isna(val) or val == '':
return 'N/A'
return f"{prefix}{float(val):.{decimals}f}{suffix}"
def format_score(value):
if pd.isna(value) or value == '':
return 'N/A'
return f"{float(value):.3f}"
        radar_metrics = [
            ("Execution Accuracy", row.get('Execution Accuracy')),
            ("Complex Reasoning", row.get('Complex Reasoning')),
            ("Robustness", row.get('Robustness')),
            ("Context & Efficiency", row.get('Context & Efficiency')),
            ("Overall Success", row.get('Overall Success')),
            ("Call Validity", row.get('Call Validity')),
        ]
radar_values = []
radar_labels = []
for label, value in radar_metrics:
if pd.isna(value) or value == '':
radar_values.append(0.0)
else:
try:
radar_values.append(max(0.0, min(1.0, float(value))))
except (TypeError, ValueError):
radar_values.append(0.0)
radar_labels.append(label)
mini_radar_html = build_static_radar_chart(radar_values, radar_labels)
level_blocks = []
for level in level_ids:
sr_col = sr_column_map.get(level)
level_blocks.append((level, row.get(sr_col, '')))
evaluation_date = EVALUATION_DATE
icon_html = ""
if KREW_ICON_BASE64:
            icon_html = f'<img src="data:image/png;base64,{KREW_ICON_BASE64}" alt="Krew icon" class="card-icon" />'  # inline the icon as a data URI
else:
icon_html = '