diff --git "a/tabs/leaderboard_v1_en.py" "b/tabs/leaderboard_v1_en.py"
new file mode 100644
--- /dev/null
+++ "b/tabs/leaderboard_v1_en.py"
@@ -0,0 +1,4114 @@
+""" +Agent Leaderboard v1 - Main leaderboard interface +Updated implementation with LLM Type support and optimized radar charts +""" + +import base64 +import math +import re +from datetime import datetime +from pathlib import Path + +import gradio as gr +import pandas as pd +import plotly.graph_objects as go + +# Import components and styles from modular files +from components.leaderboard_components import ( + get_chart_colors, get_rank_badge, get_type_badge, + get_metric_tooltip, get_responsive_styles, get_faq_section +) +from styles.leaderboard_styles import get_leaderboard_css + +ASSET_ICON_PATH = Path("krew_icon.png") +KREW_ICON_BASE64 = "" +if ASSET_ICON_PATH.exists(): + KREW_ICON_BASE64 = base64.b64encode(ASSET_ICON_PATH.read_bytes()).decode("utf-8") + +CSV_PATH = Path("combined_evaluation_summary.csv") +if CSV_PATH.exists(): + EVALUATION_DATE = datetime.fromtimestamp(CSV_PATH.stat().st_mtime).strftime("%Y-%m-%d") +else: + EVALUATION_DATE = datetime.today().strftime("%Y-%m-%d") + + +def create_leaderboard_v2_tab(): + """Create the main leaderboard v1 tab with interactive table""" + token_to_cost_factor = 2e-6 # Rough cost per token ($2 per 1M tokens) + tokens_per_turn = 1000 # Approximate tokens exchanged per turn for scaling + level_ids = [f"L{i}" for i in range(1, 8)] + level_tsq_sources = { + "L1": "L1_ArgAcc", + "L2": "L2_SelectAcc", + "L3": "L3_PSM", + "L4": "L4_Coverage", + "L5": "L5_AdaptiveRoutingScore", + "L6": "L6_EffScore", + "L7": "L7_ContextRetention", + } + + def load_leaderboard_data(): + """Load and prepare the leaderboard data""" + df = pd.read_csv('combined_evaluation_summary.csv') + + # Clean and prepare data + df = df.copy() + numeric_candidate_cols = [col for col in df.columns if col not in ('Model', 'Vendor', 'LLM Type')] + for col in numeric_candidate_cols: + df[col] = pd.to_numeric(df[col], errors='coerce') + + # Derive per-level helper columns for cost and turns + sr_columns = [] + tsq_columns = [] + duration_columns = [] + cost_columns = [] + turns_columns = [] + + for level in level_ids: + sr_col = f"{level}_SR" + if sr_col in df.columns: + sr_columns.append(sr_col) + df[sr_col] = df[sr_col].round(3) + + tsq_source = level_tsq_sources.get(level) + if tsq_source and tsq_source in df.columns: + tsq_columns.append(tsq_source) + + duration_col = f"{level}_Avg_Exec_Time" + if duration_col in df.columns: + duration_columns.append(duration_col) + + token_col = f"{level}_Avg_Tokens" + if token_col in df.columns: + cost_col = f"{level}_Avg_Cost" + turns_col = f"{level}_Avg_Turns" + df[cost_col] = df[token_col] * token_to_cost_factor + df[turns_col] = df[token_col] / tokens_per_turn + cost_columns.append(cost_col) + turns_columns.append(turns_col) + + if sr_columns: + df['Avg AC'] = df[sr_columns].mean(axis=1) + if tsq_columns: + df['Avg TSQ'] = df[tsq_columns].mean(axis=1) + if cost_columns: + df['Avg Total Cost'] = df[cost_columns].mean(axis=1) + if duration_columns: + df['Avg Session Duration'] = df[duration_columns].mean(axis=1) + if turns_columns: + df['Avg Turns'] = df[turns_columns].mean(axis=1) + + # Derive core capability metrics for radar visualization + if sr_columns: + df['Overall Success'] = df[sr_columns].mean(axis=1) + execution_cols = [col for col in ['L1_CallEM', 'L1_ArgAcc', 'L2_SelectAcc'] if col in df.columns] + if execution_cols: + df['Execution Accuracy'] = df[execution_cols].mean(axis=1) + 
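+        # Illustrative note (comment only, not executed logic): each capability
+        # aggregate here is a plain row-wise mean over whichever source columns
+        # exist in the CSV, e.g. with hypothetical scores
+        #   L1_CallEM=0.9, L1_ArgAcc=0.8, L2_SelectAcc=0.7
+        #   -> Execution Accuracy = (0.9 + 0.8 + 0.7) / 3 = 0.8
+        # Columns missing from the CSV are dropped from the mean, not zero-filled.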
reasoning_cols = [col for col in ['L3_ProvAcc', 'L3_PSM', 'L4_Coverage'] if col in df.columns] + if reasoning_cols: + df['Complex Reasoning'] = df[reasoning_cols].mean(axis=1) + robustness_cols = [col for col in ['L5_AdaptiveRoutingScore', 'L5_FallbackSR'] if col in df.columns] + if robustness_cols: + df['Robustness'] = df[robustness_cols].mean(axis=1) + context_cols = [col for col in ['L6_ReuseRate', 'L6_EffScore', 'L7_ContextRetention'] if col in df.columns] + if context_cols: + df['Context & Efficiency'] = df[context_cols].mean(axis=1) + epr_cols = [f"L{i}_EPR_CVR" for i in range(1, 8) if f"L{i}_EPR_CVR" in df.columns] + if epr_cols: + df['Call Validity'] = df[epr_cols].mean(axis=1) + + # Use LLM Type from CSV directly, with mapping to display names + if 'LLM Type' in df.columns: + # Clean the LLM Type column to remove any whitespace + df['LLM Type'] = df['LLM Type'].astype(str).str.strip() + + # Map LLM Type to Model Type + def map_llm_type(llm_type): + if llm_type.upper() == "OSS": + return "Open source" + else: + return "Proprietary" + + df['Model Type'] = df['LLM Type'].apply(map_llm_type) + else: + # Fallback to vendor mapping if LLM Type column doesn't exist + vendor_model_type_map = { + "OpenAI": "Proprietary", + "Anthropic": "Proprietary", + "Google": "Proprietary", + "Microsoft": "Proprietary", + "Mistral": "Proprietary", + "Databricks": "Open source", + "Meta": "Open source", + "Alibaba": "Open source", + "알리바바": "Open source", # Korean name for Alibaba + "Kakao": "Open source", + "SKT": "Open source", + "KT": "Open source", + "xAI": "Proprietary", + } + df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary') + + # Round numeric columns for better display + round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy', + 'Complex Reasoning', 'Robustness', 'Context & Efficiency', 'Call Validity'] + round_one_cols = ['Avg Session Duration', 'Avg Turns'] + for col in round_three_cols: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce').round(3) + for col in round_one_cols: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce').round(1) + if cost_columns: + df[cost_columns] = df[cost_columns].apply(pd.to_numeric, errors='coerce').round(3) + if turns_columns: + df[turns_columns] = df[turns_columns].apply(pd.to_numeric, errors='coerce').round(2) + if duration_columns: + df[duration_columns] = df[duration_columns].apply(pd.to_numeric, errors='coerce').round(2) + + # Fill NaN values appropriately + df = df.fillna('') + + return df + + def build_static_radar_chart(values, labels): + """Render a small static radar chart as inline SVG""" + if not values or all(v == 0 for v in values): + return """ +
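+ <!-- Empty-state fallback: rendered only when every radar value is zero or missing -->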
+ Radar Chart + Execution Accuracy · Complex Reasoning · Robustness · Context & Efficiency · Overall Success · Validity +
+ """ + size = 220 + center = size / 2 + radius = size * 0.38 + n = len(values) + def point(v, idx, scale=1.0): + angle = (2 * math.pi * idx / n) - math.pi / 2 + r = radius * v * scale + x = center + r * math.cos(angle) + y = center + r * math.sin(angle) + return x, y + polygon_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(v, i) for i, v in enumerate(values))) + ring_polygons = [] + for step in (0.33, 0.66, 1.0): + ring_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(step, i) for i in range(n))) + opacity = 0.04 if step < 1.0 else 0.08 + ring_polygons.append(f'') + axis_lines = "\n".join( + f'' + for idx in range(n) + ) + label_spans = "\n".join( + f'{label}' + for idx, label in enumerate(labels) + ) + svg = f""" + + + + + + + + + {''.join(ring_polygons)} + {axis_lines} + + {label_spans} + + """ + return svg + + # Level metadata for the 7-stage task framework + level_details = { + "ALL": { + "title": "ALL · All Tasks", + "description": "Compare overall performance levels and stage-specific strengths of models through average SR across L1~L7 levels." + }, + "L1": { + "title": ">L1 · Single Tool Execution", + "description": "Evaluates single tool execution capability and basic command performance accuracy." + }, + "L2": { + "title": "L2 · Tool Selection Capability", + "description": "Measures the ability to select appropriate tools and invoke them with proper parameters." + }, + "L3": { + "title": "L3 · Sequential Reasoning", + "description": "복수 단계의 순차적 추론을 통해 문제를 해결하는 과정을 검증합니다." + }, + "L4": { + "title": "L4 · Parallel Reasoning", + "description": "Evaluates the ability to integrate and summarize information from multiple sources in parallel." + }, + "L5": { + "title": "L5 · Robustness (Robustness / Fallback)", + "description": "Confirms recognition and response strategies for unexpected errors or failure situations." + }, + "L6": { + "title": "L6 · Efficiency (Efficiency)", + "description": "Examines operational efficiency in achieving goals with minimal calls and costs." + }, + "L7": { + "title": "L7 · Long-term Context Memory (Contextual Memory)", + "description": "Intensively analyzes the ability to maintain and appropriately utilize long-term conversation context." + } + } + default_level = "ALL" + + sr_column_map = {level: f"{level}_SR" for level in level_ids} + overall_sort_column = "Overall Success" + + def resolve_level(level_value): + """Normalize the incoming level filter value""" + if not level_value: + return default_level + return level_value if level_value in level_details else default_level + + def generate_html_table(filtered_df, highlight_column): + """Generate styled HTML table with per-level success rates""" + valid_highlights = list(sr_column_map.values()) + ["Overall Success"] + highlight_column = highlight_column if highlight_column in valid_highlights else None + overall_column = "Overall Success" + overall_highlight = (highlight_column == overall_column) + highlight_map = {level: (sr_column_map[level] == highlight_column) for level in level_ids} + + table_html = """ + + +
+ + + + + + + + """ + overall_header_classes = ["numeric-cell"] + if overall_highlight: + overall_header_classes.append("highlight-header") + table_html += f""" + + """ + for level in level_ids: + header_classes = ["numeric-cell"] + if highlight_map.get(level): + header_classes.append("highlight-header") + table_html += f""" + + """ + table_html += """ + + + + """ + def safe_float(value): + if value is None: + return '' + if isinstance(value, str) and value.strip() == '': + return '' + if pd.isna(value): + return '' + try: + return float(value) + except (TypeError, ValueError): + return '' + + # Generate table rows + for idx, (_, row) in enumerate(filtered_df.iterrows()): + rank = idx + 1 + table_html += f""" + + + + + + """ + overall_value = safe_float(row.get(overall_column, '')) + if overall_value != '': + overall_display = f'{overall_value:.3f}' + else: + overall_display = '-' + overall_classes = ["numeric-cell"] + if overall_highlight: + overall_classes.append("highlight-cell") + table_html += f'' + for level in level_ids: + sr_col = sr_column_map[level] + value = safe_float(row.get(sr_col, '')) + if value != '': + value_display = f'{value:.3f}' + else: + value_display = '-' + cell_classes = ["numeric-cell"] + if highlight_map.get(level): + cell_classes.append("highlight-cell") + table_html += f'' + table_html += "" + + table_html += """ + +
RankModelVendorLLM Type + Overall + + {level} +
{get_rank_badge(rank)}{row['Model']}{row['Vendor']}{get_type_badge(row['Model Type'])}{overall_display}{value_display}
+
+ """ + + return table_html + + def update_leaderboard_title(level_filter): + """Update the leaderboard title based on selected level""" + level_key = resolve_level(level_filter) + level_info = level_details.get(level_key, level_details[default_level]) + level_title = level_info["title"] + level_description = level_info["description"] + + return f""" +
+
+

Agent Leaderboard · {level_title}

+

{level_description}

+
+
+ """ + + model_type_lookup = { + "OSS": "Open source", + "API": "Proprietary" + } + + def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"): + """Apply shared filters and sorting to the leaderboard dataframe.""" + filtered_df = df.copy() + level_key = resolve_level(level_filter) + highlight_column = None + + if model_type_filter != "All": + mapped_type = model_type_lookup.get(model_type_filter, model_type_filter) + filtered_df = filtered_df[filtered_df['Model Type'] == mapped_type] + + actual_sort_column = sort_by if sort_by in filtered_df.columns else None + if not actual_sort_column: + if level_key == "ALL": + actual_sort_column = overall_sort_column if overall_sort_column in filtered_df.columns else None + else: + actual_sort_column = sr_column_map.get(level_key) + + if level_key in sr_column_map: + highlight_column = sr_column_map[level_key] + elif level_key == "ALL" and overall_sort_column in filtered_df.columns: + highlight_column = overall_sort_column + + if actual_sort_column and actual_sort_column in filtered_df.columns: + ascending = (sort_order == "Ascending") + filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last') + + return filtered_df, level_key, highlight_column + + def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order): + """Filter and sort the leaderboard data""" + df = load_leaderboard_data() + + filtered_df, level_key, highlight_column = apply_filters(df, level_filter, model_type_filter, sort_order, sort_by) + + # Generate HTML table + return generate_html_table(filtered_df, highlight_column) + + # Load initial data + initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending") + initial_df = load_leaderboard_data() # Load raw data for model selector + initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else [] + initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else [] + initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models) + initial_level_metric_level = level_ids[0] if level_ids else None + initial_level_model_choices = initial_df['Model'].tolist() if len(initial_df) > 0 else [] + initial_level_model_values = initial_level_model_choices[:5] + initial_level_metric_chart = create_level_metric_chart( + initial_df, + initial_level_metric_level, + initial_level_model_values + ) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available") + + # Load custom CSS and responsive styles + custom_css = get_leaderboard_css() + get_responsive_styles() + """ + + + + """ + + gr.HTML(custom_css) + + # Header styles and navigation + gr.HTML(""" + + """) + + gr.Image( + value="banner.png", + show_label=False, + interactive=False, + type="filepath", + elem_id="hero-banner" + ) + + gr.HTML(""" +
+

Hugging Face KREW Ko-AgentBench

+

Agent Benchmark Specialized for Korean Service Environment

+
+ """) + + # Links section below title + gr.HTML(""" + + """) + + # Section 1: 단계별 태스크 설계 + gr.HTML(""" +
+
+

7-Level Task Structure

+
+

From simple tool calls to long-term context understanding and robustness handling,

+

we analyzed agent capabilities from every angle across 7 levels.

+
+
+

Single-Turn

+
+ 80% +
+
    +
  • L1: Single Tool Execution
  • +
  • L2: Tool Selection Capability
  • +
  • L3: Sequential Reasoning
  • +
  • L4: Parallel Reasoning
  • +
  • L5: Robustness
  • +
+
+
+

Multi-Turn

+
+ 20% +
+
    +
  • L6: Efficiency
  • +
  • L7: Long-term Context Memory
  • +
+
+
+
+ """) + + # Section 2: 핵심 시나리오 구성 + gr.HTML(""" +
+
+

Real-life Scenario Design Using 18 APIs Optimized for Domestic Environment

+
+
+

Realistic, user-centered scenarios—such as “appointment booking” and “blog review search”—were designed

+

by integrating major domestic service APIs including Naver Maps and Kakao.

+
+
+
+ """) + + # Section 3: 핵심 평가 기준 + gr.HTML(""" +
+
+

Key Evaluation Metrics

+
+
+
+

Cache-based Iterative Evaluation

+
    +
  • Real API Response Caching
  • +
  • Solves chronic issues of existing benchmarks, such as external API instability and mismatched information attributes
  • +
  • Ensures benchmark consistency and reliability
  • +
+
+
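+ <!-- Cache-based evaluation: recorded real-API responses stand in for live calls, which is what keeps repeated runs consistent -->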
+

Robustness Test

+
    +
  • Evaluates error recognition and response strategies under intentionally injected error situations (e.g., product discontinuation)
  • +
  • Selects models that operate stably in real-world environments
  • +
+
+
+

Level-specific Evaluation Metrics

+
    +
  • Evaluates problem-solving efficiency at each stage including tool selection, parameter configuration, and data processing flow
  • +
  • Quantitatively identifies model strengths and weaknesses
  • +
+
+
+
+ """) + + # Metrics overview cards removed per updated design + + # Domain filter section with enhanced styling + gr.HTML(""" + + + """) + + level_options = list(level_details.keys()) + + with gr.Column(elem_classes=["domain-selector-container"], elem_id="task-level-selector"): + gr.HTML(""" +
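+ <!-- Task level selector: the Radio below (domain_filter) re-renders the table and every chart on change -->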
+

🧠 Select Task Level

+

Easily compare agent performance across ALL · L1~L7 stages of Ko-AgentBench.

+
+ """) + domain_filter = gr.Radio( + choices=level_options, + value=default_level, + label="", + interactive=True, + container=False, + elem_classes=["domain-radio"] + ) + + # Filter controls with domain styling + with gr.Column(elem_classes=["domain-selector-container", "filters-sorting-container"], elem_id="filters-sorting-container"): + gr.HTML(""" +
+

🔍 Filters & Sorting

+

Select model type and sorting criteria to explore results in your preferred way.

+
+ """) + with gr.Row(elem_classes=["filters-sorting-row"]): + with gr.Column(scale=1, elem_classes=["filter-group"]): + with gr.Row(elem_classes=["filter-group-row"]): + gr.HTML("Model Access") + model_type_filter = gr.Radio( + choices=["All", "OSS", "API"], + value="All", + label="", + elem_classes=["domain-radio"], + container=False + ) + with gr.Column(scale=1, elem_classes=["filter-group"]): + with gr.Row(elem_classes=["filter-group-row"]): + gr.HTML("Sort Order") + sort_order = gr.Radio( + choices=["Descending", "Ascending"], + value="Descending", + label="", + elem_classes=["domain-radio"], + container=False + ) + + # Main leaderboard table with dynamic title + leaderboard_title = gr.HTML(update_leaderboard_title(default_level)) + + leaderboard_table = gr.HTML(initial_table) + + gr.HTML(""" +
+
""") + + # Radar Chart Section + gr.HTML(""" +
+
+

Core Capability Radar

+


+

#Execution Accuracy #Complex Reasoning #Robustness #Context & Efficiency #Overall Success #Validity

+

Analyze each model's capabilities and their balance across 6 core competencies.

+
+ """) + + with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="radar-model-selector"): + gr.HTML(""" +
+

🎯 Select Models for Comparison

+

Select models to compare in the radar chart.

+
+ """) + model_selector = gr.Dropdown( + choices=initial_df['Model'].tolist()[:10], + value=initial_df['Model'].tolist()[:5], + multiselect=True, + label="", + info=None, + container=False, + elem_classes=["model-dropdown"] + ) + + # Radar chart plot - wrapped in centered container + gr.HTML('
') + radar_chart = gr.Plot( + label="", + value=create_domain_radar_chart( + load_leaderboard_data(), + initial_df['Model'].tolist()[:5] + ), + elem_classes=["radar-chart", "plot-container"] + ) + gr.HTML('
') + + gr.HTML("
") + + # Level metric breakdown section + gr.HTML(""" +
+
+

Level-Specific Metric Spotlight

+

Compare model scores based on unique evaluation metrics for each L1–L7 level.

+
+ """) + + with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"): + gr.HTML(""" +
+

🧭 Select Task Level and Models

+

Select L1–L7 levels and models to explore detailed SR-based metrics.

+
+ """) + level_metric_selector = gr.Dropdown( + choices=level_ids, + value=level_ids[0] if level_ids else None, + multiselect=False, + label="", + info=None, + container=False, + elem_classes=["level-dropdown"] + ) + level_model_selector = gr.Dropdown( + choices=initial_level_model_choices, + value=initial_level_model_values, + multiselect=True, + label="", + info=None, + container=False, + elem_classes=["model-dropdown", "level-model-dropdown"] + ) + + gr.HTML('
') + level_metric_chart = gr.Plot( + label="", + value=initial_level_metric_chart, + elem_classes=["level-metric-plot", "plot-container"] + ) + gr.HTML(""" +
+
+ """) + + # Heatmap section + gr.HTML(""" +
+
+

Comprehensive Performance Heatmap

+

Explore the comprehensive performance heatmap to see SR scores across L1–L7 levels for each model at a glance.

+
+
+ """) + heatmap_chart = gr.Plot( + label="", + value=initial_heatmap, + elem_classes=["heatmap-plot", "plot-container"] + ) + gr.HTML(""" +
+
+ """) + + # Update functions + def get_optimal_sort_order(sort_by_value): + """Return the optimal sort order for a given metric""" + # Metrics where higher is better (descending) + descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids] + + # Metrics where lower is better (ascending) + ascending_metrics = [] + + if sort_by_value in descending_metrics: + return "Descending" + elif sort_by_value in ascending_metrics: + return "Ascending" + else: + return "Descending" # Default fallback + + def update_table(level_filter, model_type_filter, sort_order): + title_html = update_leaderboard_title(level_filter) + sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success") + table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order) + return title_html, table_html + + def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models): + # Get filtered dataframe + df = load_leaderboard_data() + sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success") + filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric) + + # Update model selector choices based on filtered data + available_models_all = filtered_df['Model'].tolist() + available_models = available_models_all[:15] # Top 15 from filtered results + + # If selected models are not in available models, reset to top 5 + if selected_models: + valid_selected = [m for m in selected_models if m in available_models] + if not valid_selected: + valid_selected = available_models[:5] + else: + valid_selected = available_models[:5] + + # Create radar chart + chart = create_domain_radar_chart(filtered_df, valid_selected) + + # Prepare heatmap order prioritizing selected models + heatmap_order = [] + for model in valid_selected: + if model not in heatmap_order: + heatmap_order.append(model) + for model in available_models_all: + if model not in heatmap_order: + heatmap_order.append(model) + heatmap_order = heatmap_order[:12] + heatmap_fig = create_performance_heatmap(filtered_df, heatmap_order) + + # Level metric chart + effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None) + available_level_models = available_models_all + if level_selected_models: + valid_level_models = [m for m in level_selected_models if m in available_level_models][:5] + if not valid_level_models: + valid_level_models = available_level_models[:5] + else: + valid_level_models = available_level_models[:5] + level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics") + + return ( + gr.Dropdown( + choices=available_models, + value=valid_selected, + multiselect=True, + label="", + info=None, + container=False, + elem_classes=["model-dropdown"] + ), + chart, + heatmap_fig, + gr.Dropdown( + choices=available_level_models, + value=valid_level_models, + multiselect=True, + label="", + info=None, + container=False, + elem_classes=["model-dropdown", "level-model-dropdown"] + ), + level_metric_fig, + ) + + def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models): + # Get filtered dataframe + df = load_leaderboard_data() + sort_metric = "Overall Success" if 
domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success") + filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric) + + available_models_all = filtered_df['Model'].tolist() + if selected_models: + valid_selected = [m for m in selected_models if m in available_models_all] + if not valid_selected: + valid_selected = available_models_all[:5] + else: + valid_selected = available_models_all[:5] + + heatmap_order = [] + for model in valid_selected: + if model not in heatmap_order: + heatmap_order.append(model) + for model in available_models_all: + if model not in heatmap_order: + heatmap_order.append(model) + heatmap_order = heatmap_order[:12] + + effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None) + available_level_models = available_models_all + if level_selected_models: + valid_level_models = [m for m in level_selected_models if m in available_level_models][:5] + if not valid_level_models: + valid_level_models = available_level_models[:5] + else: + valid_level_models = available_level_models[:5] + level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics") + + return ( + create_domain_radar_chart(filtered_df, valid_selected), + create_performance_heatmap(filtered_df, heatmap_order), + gr.Dropdown( + choices=available_level_models, + value=valid_level_models, + multiselect=True, + label="", + info=None, + container=False, + elem_classes=["model-dropdown", "level-model-dropdown"] + ), + level_metric_fig, + ) + + def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models): + df = load_leaderboard_data() + sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success") + filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric) + available_models = filtered_df['Model'].tolist() + if level_selected_models: + valid_level_models = [m for m in level_selected_models if m in available_models][:5] + if not valid_level_models: + valid_level_models = available_models[:5] + else: + valid_level_models = available_models[:5] + effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None) + level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics") + return ( + gr.Dropdown( + choices=available_models, + value=valid_level_models, + multiselect=True, + label="", + info=None, + container=False, + elem_classes=["model-dropdown", "level-model-dropdown"] + ), + level_chart, + ) + + # Update table when filters change + filter_inputs = [domain_filter, model_type_filter, sort_order] + + for input_component in filter_inputs: + input_component.change( + fn=update_table, + inputs=filter_inputs, + outputs=[leaderboard_title, leaderboard_table] + ) + + # Also update radar chart when filters change + input_component.change( + fn=update_radar_chart, + inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector], + outputs=[model_selector, radar_chart, heatmap_chart, level_model_selector, level_metric_chart] + ) + + # Update radar chart when model selection changes + model_selector.change( + 
fn=update_radar_only, + inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector], + outputs=[radar_chart, heatmap_chart, level_model_selector, level_metric_chart] + ) + + level_metric_selector.change( + fn=update_level_metric_only, + inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector], + outputs=[level_model_selector, level_metric_chart] + ) + + level_model_selector.change( + fn=update_level_metric_only, + inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector], + outputs=[level_model_selector, level_metric_chart] + ) + + # Define generate_performance_card function before using it + def generate_performance_card(model_name): + """Generate HTML for the model performance card""" + if not model_name: + return """
+ Please select a model to generate its performance card +
""" + + # Get model data + df = load_leaderboard_data() + model_data = df[df['Model'] == model_name] + + if model_data.empty: + return """
+ Model not found in the database +
""" + + row = model_data.iloc[0] + + # Get overall rank based on overall success + df_with_success = df.copy() + df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce') + df_with_success = df_with_success[df_with_success['Overall Success'].notna()] + df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True) + try: + rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1 + except: + rank = 'N/A' + + # Format values + def format_value(val, decimals=3, prefix='', suffix=''): + if pd.isna(val) or val == '': + return 'N/A' + return f"{prefix}{float(val):.{decimals}f}{suffix}" + + def format_score(value): + if pd.isna(value) or value == '': + return 'N/A' + return f"{float(value):.3f}" + + # Use the same order as the domain radar but keep '견고성' (Robustness) last + radar_metrics = [ + ("Execution Accuracy", row.get('Execution Accuracy')), + ("Context & Efficiency", row.get('Context & Efficiency')), + ("Overall Success", row.get('Overall Success')), + ("Robustness", row.get('Robustness')), + ("Complex Reasoning", row.get('Complex Reasoning')), + ("Validity", row.get('Call Validity')), + ] + radar_values = [] + radar_labels = [] + for label, value in radar_metrics: + if pd.isna(value) or value == '': + radar_values.append(0.0) + else: + try: + radar_values.append(max(0.0, min(1.0, float(value)))) + except (TypeError, ValueError): + radar_values.append(0.0) + radar_labels.append(label) + + mini_radar_html = build_static_radar_chart(radar_values, radar_labels) + + level_blocks = [] + for level in level_ids: + sr_col = sr_column_map.get(level) + level_blocks.append((level, row.get(sr_col, ''))) + + evaluation_date = EVALUATION_DATE + icon_html = "" + if KREW_ICON_BASE64: + icon_html = f'Krew icon' + else: + icon_html = '
🤖
' + + card_html = f""" +
+
+
+
{icon_html}
+
+
{model_name}
+
Vendor · {row['Vendor']}
+
+ Evaluation Date + {evaluation_date} +
+
+
+
+
RANK
+
#{rank}
+
+
+
+
+
+ {mini_radar_html} +
+
+
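+ <!-- Mini radar: inline SVG produced by build_static_radar_chart above, so the card renders with no external chart assets -->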
+ """ + ordered_labels = ["Execution Accuracy", "Context & Efficiency", "Overall Success", "Robustness", "Complex Reasoning", "Validity"] + ordered_metrics = sorted(radar_metrics, key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels)) + top_metrics = ordered_metrics[:3] + bottom_metrics = ordered_metrics[3:] + card_html += """ +
+ """ + for label, value in top_metrics: + card_html += f""" +
+
{label}
+
{format_score(value)}
+
+ """ + card_html += """ +
+
+ """ + for label, value in bottom_metrics: + card_html += f""" +
+
{label}
+
{format_score(value)}
+
+ """ + card_html += """ +
+
+
+
+
+ """ + for level, value in level_blocks: + card_html += f""" +
+
{level}
+
{format_score(value)}
+
+ """ + card_html += """ +
+
+
+ """ + + return card_html + + # MODEL PERFORMANCE CARD SECTION + gr.HTML(""" +
+
+

Model Performance Card

+

+ Explore the in-depth analysis card that visualizes each model's performance spectrum across 6 key metrics, along with its success rate (SR) for levels L1~L7. +

+

+ ※ Rank is calculated based on the average SR value across L1–L7 levels. +

+
+
+ """) + + with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"): + gr.HTML(""" +
+

🤖 Select Model

+

Select a model for the analysis card.

+
+ """) + card_model_selector = gr.Dropdown( + choices=initial_df['Model'].tolist(), + value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None, + label="", + info=None, + container=False, + elem_classes=["model-dropdown"] + ) + download_card_btn = gr.Button( + "Download as PNG", + elem_id="download-card-btn", + elem_classes=["pill-button"] + ) + + gr.HTML(""" +
+ """) + + # Card display area - generate initial card + initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None + initial_card_html = generate_performance_card(initial_model) if initial_model else "" + card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html") + + gr.HTML(""" +
+
+
+ """) + + # Add custom CSS for the performance card + gr.HTML(""" + + + """) + + + # Wire up the card generator to selection change + card_model_selector.change( + fn=generate_performance_card, + inputs=[card_model_selector], + outputs=[card_display] + ) + + # Wire up download button with html2canvas capture + download_card_btn.click( + fn=None, + js=""" + async () => { + const ensureHtml2Canvas = () => new Promise((resolve, reject) => { + if (window.html2canvas) { + resolve(window.html2canvas); + return; + } + const existing = document.querySelector('script[data-html2canvas]'); + if (existing) { + existing.addEventListener('load', () => resolve(window.html2canvas)); + existing.addEventListener('error', reject); + return; + } + const script = document.createElement('script'); + script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js'; + script.async = true; + script.dataset.html2canvas = 'true'; + script.onload = () => resolve(window.html2canvas); + script.onerror = () => reject(new Error('Failed to load html2canvas')); + document.head.appendChild(script); + }); + + const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + await pause(60); + + const card = document.querySelector('.performance-card'); + if (!card) { + alert('Performance card not found. Please select a model first.'); + return; + } + + const btn = document.getElementById('download-card-btn'); + const originalText = btn?.textContent || ''; + if (btn) { + btn.textContent = 'Generating...'; + btn.disabled = true; + } + + try { + const html2canvasLib = await ensureHtml2Canvas(); + if (!html2canvasLib) { + throw new Error('html2canvas unavailable'); + } + + const canvas = await html2canvasLib(card, { + backgroundColor: '#01091A', + scale: 2, + logging: false, + useCORS: true + }); + + const link = document.createElement('a'); + const modelName = card.querySelector('.card-model-name')?.textContent || 'model'; + const timestamp = new Date().toISOString().slice(0, 10); + const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`; + link.download = fileName; + link.href = canvas.toDataURL('image/png'); + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + } catch (error) { + console.error('Error capturing card:', error); + alert('Failed to capture performance card. 
Please try again.'); + } finally { + if (btn) { + btn.textContent = originalText; + btn.disabled = false; + } + } + } + """ + ) + + # Also update card when filters change to keep model selector in sync + for input_component in filter_inputs: + def update_dropdown_and_card(*args): + filtered_df, _, _ = apply_filters( + load_leaderboard_data(), + args[0], + args[1], + args[2], + "Overall Success" if args[0] == "ALL" else sr_column_map.get(resolve_level(args[0]), "Overall Success") + ) + choices = filtered_df['Model'].tolist() + # Select first model from filtered list + value = choices[0] if choices else None + return gr.Dropdown( + choices=choices, + value=value, + label="", + info=None, + container=False, + elem_classes=["model-dropdown"] + ) + + input_component.change( + fn=update_dropdown_and_card, + inputs=filter_inputs, + outputs=[card_model_selector] + ) + + return leaderboard_table + + +def create_leaderboard_v2_interface(): + """Create the complete leaderboard v1 interface""" + return create_leaderboard_v2_tab() + + +def create_domain_radar_chart(df, selected_models=None, max_models=5): + """Visualize six core capability metrics on a radar chart.""" + df = df.copy() + # Use the same metric order and labels as the model performance card + # Match the model card order but place Robustness last as requested + metrics_info = [ + {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"}, + {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"}, + {"column": "Overall Success", "label": "Overall Success", "description": "Average Success Rate across L1~L7"}, + {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"}, + {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"}, + {"column": "Call Validity", "label": "Validity", "description": "Per-level EPR_CVR average"}, + ] + + required_columns = [m["column"] for m in metrics_info] + if df.empty or not any(col in df.columns for col in required_columns): + return create_empty_radar_chart("Not enough data to build the capability radar") + + # Default model selection + if not selected_models: + if "Overall Success" in df.columns: + top_models = df.sort_values("Overall Success", ascending=False) + else: + top_models = df + selected_models = top_models['Model'].head(max_models).tolist() + + selected_models = selected_models[:max_models] + + # Ensure metric columns are numeric + for metric in metrics_info: + col = metric["column"] + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce') + + fig = go.Figure() + angle_labels = [m["label"] for m in metrics_info] + + palette = [ + {'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'}, + {'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'}, + {'fill': 'rgba(249, 112, 185, 0.22)', 'line': '#F970B9'}, + {'fill': 'rgba(139, 92, 246, 0.20)', 'line': '#8B5CF6'}, + {'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'}, + ] + + for idx, model_name in enumerate(selected_models): + model_data = df[df['Model'] == model_name] + if model_data.empty: + continue + + row = model_data.iloc[0] + values = [] + tooltips = [] + for metric in metrics_info: + col = metric["column"] + value = row[col] if col in row else float('nan') + if pd.isna(value) or value == '': + value = 0 + values.append(float(value)) + tooltips.append(metric["description"]) + + if not values: + continue + + values_loop 
= values + [values[0]] + angles_loop = angle_labels + [angle_labels[0]] + tooltips_loop = tooltips + [tooltips[0]] + colors = palette[idx % len(palette)] + + fig.add_trace( + go.Scatterpolar( + r=values_loop, + theta=angles_loop, + fill='toself', + fillcolor=colors['fill'], + line=dict(color=colors['line'], width=3), + marker=dict( + size=10, + color=colors['line'], + symbol='circle', + line=dict(width=2, color='#01091A') + ), + name=model_name, + customdata=tooltips_loop, + mode="lines+markers", + hovertemplate="%{fullData.name}
" + + "%{theta}
" + + "%{customdata}
" + + "%{r:.3f}
" + + "", + hoverlabel=dict( + bgcolor="rgba(1, 9, 26, 0.95)", + bordercolor=colors['line'], + font=dict(color="white", size=12, family="'Geist', sans-serif") + ) + ) + ) + + tick_vals = [i / 5 for i in range(6)] + tick_text = [f"{val:.2f}" for val in tick_vals] + + fig.update_layout( + polar=dict( + bgcolor='rgba(245, 246, 247, 0.03)', + radialaxis=dict( + visible=True, + range=[0, 1], + showline=True, + linewidth=2, + linecolor='rgba(245, 246, 247, 0.2)', + gridcolor='rgba(245, 246, 247, 0.1)', + gridwidth=1, + tickvals=tick_vals, + ticktext=tick_text, + tickfont=dict( + size=11, + color='white', + family="'Geist Mono', monospace" + ) + ), + angularaxis=dict( + showline=True, + linewidth=2, + linecolor='rgba(245, 246, 247, 0.2)', + gridcolor='rgba(245, 246, 247, 0.08)', + tickfont=dict( + size=13, + family="'Geist', sans-serif", + color='white', + weight=600 + ), + rotation=90, + direction="clockwise", + ), + ), + showlegend=True, + legend=dict( + orientation="h", + yanchor="bottom", + y=-0.15, + xanchor="center", + x=0.5, + font=dict(size=12, family="'Geist', sans-serif", color='white'), + bgcolor='rgba(1, 9, 26, 0.8)', + bordercolor='rgba(245, 246, 247, 0.2)', + borderwidth=1, + itemsizing='constant', + itemwidth=30 + ), + title=dict( + text="Core Capability Radar", + x=0.5, + y=0.97, + font=dict( + size=22, + family="'Geist', sans-serif", + color="white", + weight=700 + ), + ), + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=800, + width=900, + margin=dict(t=30, b=50, l=10, r=10), + autosize=True, + annotations=[ + dict( + text="Ko-Agent Leaderboard", + xref="paper", yref="paper", + x=0.98, y=0.02, + xanchor='right', yanchor='bottom', + font=dict(size=10, color='#64748B'), + showarrow=False + ) + ] + ) + + return fig + + +def create_performance_heatmap(df, ordered_models=None, max_models=12): + """Render a heatmap of SR scores across task levels for selected models.""" + df = df.copy() + level_sequence = [f"L{i}" for i in range(1, 8)] + sr_columns = [] + for level in level_sequence: + col = f"{level}_SR" + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors="coerce") + sr_columns.append((level, col)) + if df.empty or not sr_columns: + return create_empty_heatmap("Not enough SR data to render the heatmap") + + df = df.drop_duplicates(subset=["Model"]) + if df.empty: + return create_empty_heatmap("No models available to render the heatmap") + + sort_column = "Overall Success" if "Overall Success" in df.columns else sr_columns[0] + df = df.sort_values(sort_column, ascending=False) + + if ordered_models: + ordered_models = [m for m in ordered_models if m in df["Model"].tolist()] + else: + ordered_models = df["Model"].tolist() + + if not ordered_models: + return create_empty_heatmap("No models available to render the heatmap") + + ordered_models = ordered_models[:max_models] + heatmap_df = df.set_index("Model").reindex(ordered_models) + + level_labels = [] + z_matrix = [] + has_values = False + + for level, col in sr_columns: + if col not in heatmap_df.columns: + continue + label = f"{level} · SR" + level_labels.append(label) + row_values = [] + for model in ordered_models: + value = heatmap_df.at[model, col] if model in heatmap_df.index else None + if pd.isna(value): + row_values.append(None) + else: + val = float(value) + row_values.append(val) + has_values = True + z_matrix.append(row_values) + + if not level_labels or not has_values: + return create_empty_heatmap("Not enough SR data to render the heatmap") + + colorscale = [ + [0.0, 
"#0A0A0A"], + [0.25, "#1A1411"], + [0.5, "#332818"], + [0.75, "#B8660A"], + [1.0, "#FFD21E"], + ] + + fig = go.Figure() + fig.add_trace( + go.Heatmap( + z=z_matrix, + x=ordered_models, + y=level_labels, + colorscale=colorscale, + zmin=0, + zmax=1, + hovertemplate="%{y}
%{x}
SR · %{z:.3f}", + colorbar=dict( + title="Success Rate", + titlefont=dict(color="white", family="'Geist', sans-serif", size=12), + tickfont=dict(color="white", family="'Geist', sans-serif", size=10), + thickness=12, + len=0.7, + outlinecolor="rgba(255, 255, 255, 0.1)", + bgcolor="rgba(1, 9, 26, 0.75)" + ), + showscale=True + ) + ) + + annotations = [] + for y_idx, level in enumerate(level_labels): + for x_idx, model in enumerate(ordered_models): + value = z_matrix[y_idx][x_idx] + if value is None: + continue + font_color = "#0B1120" if value >= 0.6 else "#F8FAFC" + annotations.append( + dict( + x=model, + y=level, + text=f"{value:.3f}", + showarrow=False, + font=dict( + family="'Geist Mono', monospace", + size=11, + color=font_color + ) + ) + ) + + fig.update_layout( + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + margin=dict(t=80, b=90, l=110, r=160), + height=520, + width=1450, + font=dict(family="'Geist', sans-serif", color="white"), + xaxis=dict( + tickangle=-25, + showgrid=False, + ticks="", + tickfont=dict(size=11, family="'Geist', sans-serif", color="white") + ), + yaxis=dict( + showgrid=False, + ticks="", + tickfont=dict(size=12, family="'Geist', sans-serif", color="white") + ), + annotations=annotations, + title=dict( + text="Comprehensive Performance Heatmap", + x=0.5, + y=0.98, + font=dict( + size=20, + family="'Geist', sans-serif", + color="white", + weight=700 + ), + ) + ) + fig.update_xaxes(side="bottom") + + return fig + + +def create_empty_heatmap(message): + """Render an empty state for the heatmap with a centered message.""" + fig = go.Figure() + fig.add_annotation( + text=f"🗺️ {message}", + xref="paper", yref="paper", + x=0.5, y=0.5, + xanchor='center', yanchor='middle', + font=dict( + size=18, + color="white", + family="'Geist', sans-serif" + ), + showarrow=False, + bgcolor="rgba(245, 246, 247, 0.05)", + bordercolor="rgba(245, 246, 247, 0.2)", + borderwidth=1, + borderpad=20 + ) + + fig.update_layout( + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=520, + width=1450, + margin=dict(t=80, b=80, l=80, r=160), + title=dict( + text="Comprehensive Performance Heatmap", + x=0.5, + y=0.98, + font=dict( + size=20, + family="'Geist', sans-serif", + color="white", + weight=700 + ), + ) + ) + fig.update_xaxes(visible=False) + fig.update_yaxes(visible=False) + return fig + + +def create_level_metric_chart(df, level, selected_models=None, max_models=5): + """Render a grouped horizontal bar chart showing per-model scores for a level's metrics.""" + if not level: + return create_empty_level_metric_chart("Select a level to view its metrics") + df = df.copy() + level_prefix = f"{level}_" + level_columns = [col for col in df.columns if col.startswith(level_prefix)] + metric_columns = [] + for col in level_columns: + metric_suffix = col[len(level_prefix):] + metric_key_lower = metric_suffix.lower() + if "cost" in metric_key_lower: + continue + numeric_series = pd.to_numeric(df[col], errors='coerce') + valid_values = numeric_series.dropna() + if valid_values.empty: + continue + if (valid_values < 0).any() or (valid_values > 1.05).any(): + continue + df[col] = numeric_series + metric_columns.append(col) + if not metric_columns: + return create_empty_level_metric_chart("This level has no 0-1 metrics to visualize") + df = df.drop_duplicates(subset=['Model']) + if df.empty: + return create_empty_level_metric_chart("No models available to render level metrics") + if selected_models: + model_order = [m for m in selected_models if m in 
df['Model'].tolist()] + else: + sort_col = 'Overall Success' if 'Overall Success' in df.columns else metric_columns[0] + model_order = df.sort_values(sort_col, ascending=False)['Model'].tolist() + if not model_order: + model_order = df['Model'].tolist() + model_order = model_order[:max_models] + df_models = df[df['Model'].isin(model_order)].set_index('Model') + if df_models.empty: + return create_empty_level_metric_chart("No matching models for selected filters") + def prettify_metric_name(metric_key): + raw = metric_key[len(level_prefix):] + text = raw.replace('_', ' ') + text = re.sub(r'(?<=.)([A-Z])', r' \1', text) + text = text.replace('Avg', 'Average') + replacements = { + 'Sr': 'SR', + 'Ac': 'AC', + 'Tsq': 'TSQ', + 'Cvr': 'CVR', + 'Psm': 'PSM', + 'Prov': 'Prov', + 'Call Em': 'CallEM', + 'Reuse Rate': 'Reuse Rate', + 'Eff Score': 'Eff Score' + } + words = text.title().split() + words = [replacements.get(word, word) for word in words] + return ' '.join(words) + metric_labels = [] + for col in metric_columns: + label = prettify_metric_name(col) + if label in metric_labels: + suffix = 2 + while f"{label} ({suffix})" in metric_labels: + suffix += 1 + label = f"{label} ({suffix})" + metric_labels.append(label) + model_palette = [ + '#ffd21e', + '#FF8A3C', + '#F970B9', + '#8B5CF6', + '#F8FAFC', + '#38BDF8', + ] + fig = go.Figure() + max_value = 0 + for idx, model in enumerate(model_order): + values = [] + for col in metric_columns: + value = df_models.at[model, col] if (model in df_models.index and col in df_models.columns) else float('nan') + if pd.notna(value): + values.append(float(value)) + max_value = max(max_value, float(value)) + else: + values.append(None) + color = model_palette[idx % len(model_palette)] + fig.add_trace( + go.Bar( + name=model, + y=metric_labels, + x=values, + orientation='h', + marker=dict(color=color, line=dict(color='rgba(1,9,26,0.8)', width=1)), + hovertemplate="%{y}
Model · %{fullData.name}
Score · %{x:.3f}", + ) + ) + plot_height = max(360, 140 + 48 * len(metric_labels)) + if max_value <= 0: + x_range = [0, 1] + else: + x_range = [0, max_value * 1.05] + fig.update_layout( + barmode='group', + bargap=0.25, + bargroupgap=0.18, + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=plot_height, + width=1450, + margin=dict(t=90, b=80, l=220, r=160), + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1, + bgcolor='rgba(1, 9, 26, 0.75)', + bordercolor='rgba(245, 246, 247, 0.2)', + borderwidth=1, + font=dict(size=11, family="'Geist', sans-serif", color='white') + ), + xaxis=dict( + title=dict(text=f"{level} Metric Score", font=dict(size=14, color="white")), + tickfont=dict(size=11, color="white"), + gridcolor='rgba(245, 246, 247, 0.08)', + zerolinecolor='rgba(245, 246, 247, 0.18)', + range=x_range + ), + yaxis=dict( + tickfont=dict(size=13, color="white"), + automargin=True + ), + title=dict( + text=f"{level} Metric Breakdown", + x=0.5, + y=0.98, + font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700) + ) + ) + return fig + + +def create_empty_level_metric_chart(message): + fig = go.Figure() + fig.add_annotation( + text=f"🧭 {message}", + xref="paper", yref="paper", + x=0.5, y=0.5, + xanchor='center', yanchor='middle', + font=dict(size=18, color="white", family="'Geist', sans-serif"), + showarrow=False, + bgcolor="rgba(245, 246, 247, 0.05)", + bordercolor="rgba(245, 246, 247, 0.2)", + borderwidth=1, + borderpad=20 + ) + fig.update_layout( + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=420, + width=1450, + margin=dict(t=80, b=60, l=80, r=120), + title=dict( + text="Level Metric Breakdown", + x=0.5, + y=0.98, + font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700) + ) + ) + fig.update_xaxes(visible=False) + fig.update_yaxes(visible=False) + return fig + + +def create_empty_radar_chart(message): + """Create an empty radar chart with a message""" + fig = go.Figure() + + fig.add_annotation( + text=f"📊 {message}", + xref="paper", yref="paper", + x=0.5, y=0.5, + xanchor='center', yanchor='middle', + font=dict( + size=18, + color="white", + family="'Geist', sans-serif" + ), + showarrow=False, + bgcolor="rgba(245, 246, 247, 0.05)", + bordercolor="rgba(245, 246, 247, 0.2)", + borderwidth=1, + borderpad=20 + ) + + fig.update_layout( + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=1450, + width=1450, + margin=dict(t=100, b=80, l=80, r=200), + title=dict( + text="Core Capability Radar", + x=0.5, + y=0.97, + font=dict( + size=22, + family="'Geist', sans-serif", + color="white", + weight=700 + ), + ), + annotations=[ + dict( + xref="paper", yref="paper", + x=0.98, y=0.02, + xanchor='right', yanchor='bottom', + font=dict(size=10, color='#64748B'), + showarrow=False + ) + ] + ) + + return fig + + +# NEW VISUALIZATION FUNCTIONS + +def create_cost_performance_scatter(df, metric="Avg AC"): + """Create scatter plot showing cost vs performance efficiency""" + # Filter out models without cost or performance data + df_filtered = df[(df['Avg Total Cost'] != '') & (df[metric] != '')].copy() + label_map = { + 'Proprietary': 'API', + 'Open source': 'OSS' + } + + if df_filtered.empty: + return create_empty_chart("No data available for cost-performance analysis") + + # Convert to numeric + df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce') + df_filtered[metric] = pd.to_numeric(df_filtered[metric], 
errors='coerce') + df_filtered['Avg Turns'] = pd.to_numeric(df_filtered['Avg Turns'], errors='coerce') + + # Create color mapping for model type + color_map = { + 'Proprietary': '#1098F7', # Airglow Blue for Proprietary + 'Open source': '#58BC82' # Green for Open source + } + df_filtered['Color'] = df_filtered['Model Type'].map(color_map).fillna('#F5F6F7') + + fig = go.Figure() + + # Add scatter points + for model_type in df_filtered['Model Type'].unique(): + df_type = df_filtered[df_filtered['Model Type'] == model_type] + legend_name = label_map.get(model_type, model_type) + + fig.add_trace(go.Scatter( + x=df_type[metric], + y=df_type['Avg Total Cost'], + mode='markers+text', + name=legend_name, + text=df_type['Model'], + textposition="top center", + textfont=dict(size=10, color='white'), + marker=dict( + size=df_type['Avg Turns'] * 3, # Size based on number of turns + color=color_map.get(model_type, '#F5F6F7'), + opacity=0.8, + line=dict(width=2, color='#01091A') + ), + hovertemplate="%{text}
" + + f"{metric}: %{{x:.3f}}
" + + "Cost: $%{y:.3f}
" + + "Turns: %{marker.size:.1f}
" + + "" + )) + + # Add quadrant lines + median_x = df_filtered[metric].median() + median_y = df_filtered['Avg Total Cost'].median() + + fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5) + fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5) + + # Add quadrant labels + fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance
Low Cost", + showarrow=False, xref="paper", yref="paper", + font=dict(size=12, color="white"), bgcolor="rgba(245, 246, 247, 0.1)") + fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance
High Cost", + showarrow=False, xref="paper", yref="paper", + font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)") + + metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality" + + fig.update_layout( + title=dict( + text=f"Cost-Performance Efficiency: {metric_display}", + x=0.5, + y=0.97, + font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700) + ), + xaxis=dict( + title=dict( + text=f"{metric_display}", + font=dict(size=16, color="white") + ), + tickfont=dict(size=12, color="white"), + gridcolor="rgba(245, 246, 247, 0.1)", + zerolinecolor="rgba(245, 246, 247, 0.2)" + ), + yaxis=dict( + title=dict( + text="Average Session Cost ($)", + font=dict(size=16, color="white") + ), + tickfont=dict(size=12, color="white"), + gridcolor="rgba(245, 246, 247, 0.1)", + zerolinecolor="rgba(245, 246, 247, 0.2)" + ), + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=900, + width=1450, + showlegend=True, + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1, + font=dict(size=12, family="'Geist', sans-serif", color='white'), + bgcolor='rgba(1, 9, 26, 0.8)', + bordercolor='rgba(245, 246, 247, 0.2)', + borderwidth=1 + ), + margin=dict(t=100, b=80, l=80, r=80) + ) + + return fig + + +def create_speed_accuracy_plot(df, metric="Avg AC"): + """Create scatter plot showing speed vs accuracy trade-off""" + # Filter out models without duration or performance data + df_filtered = df[(df['Avg Session Duration'] != '') & (df[metric] != '')].copy() + + if df_filtered.empty: + return create_empty_chart("No data available for speed-accuracy analysis") + + # Convert to numeric + df_filtered['Avg Session Duration'] = pd.to_numeric(df_filtered['Avg Session Duration'], errors='coerce') + df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce') + + # Create color scale based on cost + df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce') + + fig = go.Figure() + + # Add scatter trace + fig.add_trace(go.Scatter( + x=df_filtered[metric], + y=df_filtered['Avg Session Duration'], + mode='markers+text', + text=df_filtered['Model'], + textposition="top center", + textfont=dict(size=9, color='white'), + marker=dict( + size=12, + color=df_filtered['Avg Total Cost'], + colorscale=[[0, '#0A0A0A'], [0.5, '#B8660A'], [1, '#ffd21e']], + showscale=True, + colorbar=dict( + title=dict( + text="Cost ($)", + font=dict(color="white") + ), + tickfont=dict(color="white"), + bgcolor="rgba(1, 9, 26, 0.8)", + bordercolor="rgba(245, 246, 247, 0.2)", + borderwidth=1, + x=1.02 + ), + line=dict(width=2, color='#01091A') + ), + hovertemplate="%{text}
" + + f"{metric}: %{{x:.3f}}
" + + "Duration: %{y:.1f}s
" + + "Cost: $%{marker.color:.3f}
" + + "" + )) + + # Add quadrant lines + median_x = df_filtered[metric].median() + median_y = df_filtered['Avg Session Duration'].median() + + fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5) + fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5) + + # Add quadrant labels + fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate", + showarrow=False, xref="paper", yref="paper", + font=dict(size=12, color="white", weight=600)) + fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate", + showarrow=False, xref="paper", yref="paper", + font=dict(size=12, color="#ffd21e", weight=600)) + + metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality" + + fig.update_layout( + title=dict( + text=f"Speed vs Accuracy Trade-off: {metric_display}", + x=0.5, + y=0.97, + font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700) + ), + xaxis=dict( + title=dict( + text=f"{metric_display}", + font=dict(size=16, color="white") + ), + tickfont=dict(size=12, color="white"), + gridcolor="rgba(245, 246, 247, 0.1)", + zerolinecolor="rgba(245, 246, 247, 0.2)" + ), + yaxis=dict( + title=dict( + text="Average Session Duration (seconds)", + font=dict(size=16, color="white") + ), + tickfont=dict(size=12, color="white"), + gridcolor="rgba(245, 246, 247, 0.1)", + zerolinecolor="rgba(245, 246, 247, 0.2)" + ), + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=900, + width=1450, + margin=dict(t=100, b=80, l=80, r=120) + ) + + return fig + + +def create_domain_specialization_matrix(df, metric_type="AC"): + """Create bubble chart showing domain specialization""" + domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom'] + + # Prepare data + data = [] + for _, model in df.iterrows(): + if model['Model'] == '': + continue + + model_avg = pd.to_numeric(model[f'Avg {metric_type}'], errors='coerce') + if pd.isna(model_avg): + continue + + for domain in domains: + domain_col = f'{domain} {metric_type}' + if domain_col in model and model[domain_col] != '': + domain_val = pd.to_numeric(model[domain_col], errors='coerce') + if not pd.isna(domain_val): + # Calculate specialization strength (deviation from model average) + specialization = domain_val - model_avg + data.append({ + 'Model': model['Model'], + 'Domain': domain, + 'Performance': domain_val, + 'Specialization': specialization, + 'Model Type': model['Model Type'] + }) + + if not data: + return create_empty_chart("No domain specialization data available") + + df_plot = pd.DataFrame(data) + + # Create bubble chart + fig = go.Figure() + + # Color based on specialization strength + fig.add_trace(go.Scatter( + x=df_plot['Domain'], + y=df_plot['Model'], + mode='markers', + marker=dict( + size=df_plot['Performance'] * 30, # Size based on absolute performance + color=df_plot['Specialization'], + colorscale=[[0, '#B8660A'], [0.5, '#E6B800'], [1, '#ffd21e']], + showscale=True, + colorbar=dict( + title=dict( + text="Specialization
Strength", + font=dict(color="white") + ), + tickfont=dict(color="white"), + bgcolor="rgba(1, 9, 26, 0.8)", + bordercolor="rgba(245, 246, 247, 0.2)", + borderwidth=1 + ), + line=dict(width=2, color='#01091A'), + opacity=0.8 + ), + text=[f"Performance: {p:.3f}
+              for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
+        hovertemplate="<b>%{y}</b><br>" +
+                      "Domain: %{x}<br>" +
+                      "%{text}<br>" +
" + + "" + )) + + metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality" + + fig.update_layout( + title=dict( + text=f"Domain Specialization Matrix: {metric_display}", + x=0.5, + y=0.97, + font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700) + ), + xaxis=dict( + title=dict( + text="Business Domains", + font=dict(size=16, color="white") + ), + tickfont=dict(size=13, color="white"), + gridcolor="rgba(245, 246, 247, 0.1)" + ), + yaxis=dict( + title=dict( + text="Models", + font=dict(size=16, color="white") + ), + tickfont=dict(size=11, color="white"), + gridcolor="rgba(245, 246, 247, 0.1)" + ), + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=1100, + width=1450, + margin=dict(t=100, b=80, l=220, r=120) + ) + + return fig + + +def create_performance_gap_analysis(df, metric_type="AC"): + """Create range plot showing performance gaps by domain""" + domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom'] + + # Calculate min, max, median for each domain + gap_data = [] + for domain in domains: + domain_col = f'{domain} {metric_type}' + if domain_col in df.columns: + domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna() + if len(domain_values) > 0: + gap_data.append({ + 'Domain': domain, + 'Min': domain_values.min(), + 'Max': domain_values.max(), + 'Median': domain_values.median(), + 'Q1': domain_values.quantile(0.25), + 'Q3': domain_values.quantile(0.75), + 'Gap': domain_values.max() - domain_values.min() + }) + + if not gap_data: + return create_empty_chart("No data available for gap analysis") + + df_gap = pd.DataFrame(gap_data) + df_gap = df_gap.sort_values('Gap', ascending=True) + + fig = go.Figure() + + # Add range bars + for idx, row in df_gap.iterrows(): + # Add full range line + fig.add_trace(go.Scatter( + x=[row['Min'], row['Max']], + y=[row['Domain'], row['Domain']], + mode='lines', + line=dict(color='#64748B', width=2), + showlegend=False, + hoverinfo='skip' + )) + + # Add IQR box + fig.add_trace(go.Scatter( + x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']], + y=[row['Domain'], row['Domain'], row['Domain'], row['Domain'], row['Domain']], + fill='toself', + fillcolor='rgba(255, 210, 30, 0.3)', + line=dict(color='#ffd21e', width=2), + showlegend=False, + hoverinfo='skip', + mode='lines' + )) + + # Add median marker + fig.add_trace(go.Scatter( + x=[row['Median']], + y=[row['Domain']], + mode='markers', + marker=dict( + size=12, + color='#ffd21e', + symbol='diamond', + line=dict(width=2, color='#01091A') + ), + showlegend=False, + hovertemplate=f"{row['Domain']}
" + + f"Min: {row['Min']:.3f}
" + + f"Q1: {row['Q1']:.3f}
" + + f"Median: {row['Median']:.3f}
" + + f"Q3: {row['Q3']:.3f}
" + + f"Max: {row['Max']:.3f}
" + + f"Gap: {row['Gap']:.3f}
" + + "" + )) + + # Add min/max points + for idx, row in df_gap.iterrows(): + fig.add_trace(go.Scatter( + x=[row['Min'], row['Max']], + y=[row['Domain'], row['Domain']], + mode='markers', + marker=dict(size=8, color='white', line=dict(width=2, color='#01091A')), + showlegend=False, + hoverinfo='skip' + )) + + metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality" + + fig.update_layout( + title=dict( + text=f"Performance Gap Analysis by Domain: {metric_display}", + x=0.5, + y=0.97, + font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700) + ), + xaxis=dict( + title=dict( + text=f"{metric_display} Score", + font=dict(size=16, color="white") + ), + tickfont=dict(size=12, color="white"), + gridcolor="rgba(245, 246, 247, 0.1)", + range=[0, 1] if metric_type in ['AC', 'TSQ'] else None + ), + yaxis=dict( + title=dict( + text="Business Domain", + font=dict(size=16, color="white") + ), + tickfont=dict(size=13, color="white"), + gridcolor="rgba(245, 246, 247, 0.1)" + ), + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=800, + width=1450, + margin=dict(t=100, b=80, l=140, r=80), + showlegend=False + ) + + # Add legend manually + fig.add_annotation( + text="◆ Median ━ IQR ─ Full Range", + xref="paper", yref="paper", + x=0.98, y=0.02, + xanchor='right', yanchor='bottom', + font=dict(size=12, color='white'), + showarrow=False + ) + + return fig + + +def create_empty_chart(message): + """Create an empty chart with a message""" + fig = go.Figure() + + fig.add_annotation( + text=f"📊 {message}", + xref="paper", yref="paper", + x=0.5, y=0.5, + xanchor='center', yanchor='middle', + font=dict( + size=18, + color="white", + family="'Geist', sans-serif" + ), + showarrow=False, + bgcolor="rgba(245, 246, 247, 0.05)", + bordercolor="rgba(245, 246, 247, 0.2)", + borderwidth=1, + borderpad=20 + ) + + fig.update_layout( + paper_bgcolor="#01091A", + plot_bgcolor="rgba(245, 246, 247, 0.02)", + height=700, + width=1450, + margin=dict(t=80, b=80, l=80, r=80) + )