diff --git "a/tabs/leaderboard_v1_en.py" "b/tabs/leaderboard_v1_en.py"
new file mode 100644
--- /dev/null
+++ "b/tabs/leaderboard_v1_en.py"
@@ -0,0 +1,4114 @@
+"""
+Agent Leaderboard v1 - Main leaderboard interface
+Updated implementation with LLM Type support and optimized radar charts
+"""
+
+import base64
+import math
+import re
+from datetime import datetime
+from pathlib import Path
+
+import gradio as gr
+import pandas as pd
+import plotly.graph_objects as go
+
+# Import components and styles from modular files
+from components.leaderboard_components import (
+ get_chart_colors, get_rank_badge, get_type_badge,
+ get_metric_tooltip, get_responsive_styles, get_faq_section
+)
+from styles.leaderboard_styles import get_leaderboard_css
+
+ASSET_ICON_PATH = Path("krew_icon.png")
+KREW_ICON_BASE64 = ""
+if ASSET_ICON_PATH.exists():
+ KREW_ICON_BASE64 = base64.b64encode(ASSET_ICON_PATH.read_bytes()).decode("utf-8")
+
+CSV_PATH = Path("combined_evaluation_summary.csv")
+if CSV_PATH.exists():
+ EVALUATION_DATE = datetime.fromtimestamp(CSV_PATH.stat().st_mtime).strftime("%Y-%m-%d")
+else:
+ EVALUATION_DATE = datetime.today().strftime("%Y-%m-%d")
+
+
+def create_leaderboard_v2_tab():
+ """Create the main leaderboard v1 tab with interactive table"""
+ token_to_cost_factor = 2e-6 # Rough cost per token ($2 per 1M tokens)
+ tokens_per_turn = 1000 # Approximate tokens exchanged per turn for scaling
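+    # Worked example under these heuristics: a session averaging 12,000
+    # tokens is costed at 12,000 * 2e-6 = $0.024 and counted as ~12 turns.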
+ level_ids = [f"L{i}" for i in range(1, 8)]
+ level_tsq_sources = {
+ "L1": "L1_ArgAcc",
+ "L2": "L2_SelectAcc",
+ "L3": "L3_PSM",
+ "L4": "L4_Coverage",
+ "L5": "L5_AdaptiveRoutingScore",
+ "L6": "L6_EffScore",
+ "L7": "L7_ContextRetention",
+ }
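+    # Each entry names the level's representative tool-selection-quality (TSQ)
+    # column in the CSV; load_leaderboard_data averages these into 'Avg TSQ'.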
+
+ def load_leaderboard_data():
+ """Load and prepare the leaderboard data"""
+        df = pd.read_csv(CSV_PATH)
+
+ # Clean and prepare data
+ df = df.copy()
+        numeric_candidate_cols = [col for col in df.columns if col not in ('Model', 'Vendor', 'LLM Type')]
+ for col in numeric_candidate_cols:
+ df[col] = pd.to_numeric(df[col], errors='coerce')
+
+ # Derive per-level helper columns for cost and turns
+ sr_columns = []
+ tsq_columns = []
+ duration_columns = []
+ cost_columns = []
+ turns_columns = []
+
+ for level in level_ids:
+ sr_col = f"{level}_SR"
+ if sr_col in df.columns:
+ sr_columns.append(sr_col)
+ df[sr_col] = df[sr_col].round(3)
+
+ tsq_source = level_tsq_sources.get(level)
+ if tsq_source and tsq_source in df.columns:
+ tsq_columns.append(tsq_source)
+
+ duration_col = f"{level}_Avg_Exec_Time"
+ if duration_col in df.columns:
+ duration_columns.append(duration_col)
+
+ token_col = f"{level}_Avg_Tokens"
+ if token_col in df.columns:
+ cost_col = f"{level}_Avg_Cost"
+ turns_col = f"{level}_Avg_Turns"
+ df[cost_col] = df[token_col] * token_to_cost_factor
+ df[turns_col] = df[token_col] / tokens_per_turn
+ cost_columns.append(cost_col)
+ turns_columns.append(turns_col)
+
+ if sr_columns:
+ df['Avg AC'] = df[sr_columns].mean(axis=1)
+ if tsq_columns:
+ df['Avg TSQ'] = df[tsq_columns].mean(axis=1)
+ if cost_columns:
+ df['Avg Total Cost'] = df[cost_columns].mean(axis=1)
+ if duration_columns:
+ df['Avg Session Duration'] = df[duration_columns].mean(axis=1)
+ if turns_columns:
+ df['Avg Turns'] = df[turns_columns].mean(axis=1)
+
+ # Derive core capability metrics for radar visualization
+ if sr_columns:
+ df['Overall Success'] = df[sr_columns].mean(axis=1)
+ execution_cols = [col for col in ['L1_CallEM', 'L1_ArgAcc', 'L2_SelectAcc'] if col in df.columns]
+ if execution_cols:
+ df['Execution Accuracy'] = df[execution_cols].mean(axis=1)
+ reasoning_cols = [col for col in ['L3_ProvAcc', 'L3_PSM', 'L4_Coverage'] if col in df.columns]
+ if reasoning_cols:
+ df['Complex Reasoning'] = df[reasoning_cols].mean(axis=1)
+ robustness_cols = [col for col in ['L5_AdaptiveRoutingScore', 'L5_FallbackSR'] if col in df.columns]
+ if robustness_cols:
+ df['Robustness'] = df[robustness_cols].mean(axis=1)
+        context_cols = [col for col in ['L6_ReuseRate', 'L6_EffScore', 'L7_ContextRetention'] if col in df.columns]
+ if context_cols:
+ df['Context & Efficiency'] = df[context_cols].mean(axis=1)
+ epr_cols = [f"L{i}_EPR_CVR" for i in range(1, 8) if f"L{i}_EPR_CVR" in df.columns]
+ if epr_cols:
+ df['Call Validity'] = df[epr_cols].mean(axis=1)
+
+ # Use LLM Type from CSV directly, with mapping to display names
+ if 'LLM Type' in df.columns:
+ # Clean the LLM Type column to remove any whitespace
+ df['LLM Type'] = df['LLM Type'].astype(str).str.strip()
+
+ # Map LLM Type to Model Type
+ def map_llm_type(llm_type):
+ if llm_type.upper() == "OSS":
+ return "Open source"
+ else:
+ return "Proprietary"
+
+ df['Model Type'] = df['LLM Type'].apply(map_llm_type)
+ else:
+ # Fallback to vendor mapping if LLM Type column doesn't exist
+ vendor_model_type_map = {
+ "OpenAI": "Proprietary",
+ "Anthropic": "Proprietary",
+ "Google": "Proprietary",
+ "Microsoft": "Proprietary",
+ "Mistral": "Proprietary",
+ "Databricks": "Open source",
+ "Meta": "Open source",
+ "Alibaba": "Open source",
+ "알리바바": "Open source", # Korean name for Alibaba
+ "Kakao": "Open source",
+ "SKT": "Open source",
+ "KT": "Open source",
+ "xAI": "Proprietary",
+ }
+ df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary')
+
+ # Round numeric columns for better display
+ round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy',
+ 'Complex Reasoning', 'Robustness', 'Context & Efficiency', 'Call Validity']
+ round_one_cols = ['Avg Session Duration', 'Avg Turns']
+ for col in round_three_cols:
+ if col in df.columns:
+ df[col] = pd.to_numeric(df[col], errors='coerce').round(3)
+ for col in round_one_cols:
+ if col in df.columns:
+ df[col] = pd.to_numeric(df[col], errors='coerce').round(1)
+ if cost_columns:
+ df[cost_columns] = df[cost_columns].apply(pd.to_numeric, errors='coerce').round(3)
+ if turns_columns:
+ df[turns_columns] = df[turns_columns].apply(pd.to_numeric, errors='coerce').round(2)
+ if duration_columns:
+ df[duration_columns] = df[duration_columns].apply(pd.to_numeric, errors='coerce').round(2)
+
+ # Fill NaN values appropriately
+ df = df.fillna('')
+
+ return df
+
+ def build_static_radar_chart(values, labels):
+ """Render a small static radar chart as inline SVG"""
+ if not values or all(v == 0 for v in values):
+ return """
+
+ Radar Chart
+ Execution Accuracy · Complex Reasoning · Robustness · Context & Efficiency · Overall Success · Validity
+
+ """
+ size = 220
+ center = size / 2
+ radius = size * 0.38
+ n = len(values)
+ def point(v, idx, scale=1.0):
+ angle = (2 * math.pi * idx / n) - math.pi / 2
+ r = radius * v * scale
+ x = center + r * math.cos(angle)
+ y = center + r * math.sin(angle)
+ return x, y
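+    # point() spaces the n vertices evenly, starting at 12 o'clock via the
+    # -pi/2 offset; with SVG's downward y-axis the vertices step clockwise.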
+ polygon_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(v, i) for i, v in enumerate(values)))
+ ring_polygons = []
+ for step in (0.33, 0.66, 1.0):
+ ring_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(step, i) for i in range(n)))
+ opacity = 0.04 if step < 1.0 else 0.08
+ ring_polygons.append(f' ')
+ axis_lines = "\n".join(
+ f' '
+ for idx in range(n)
+ )
+ label_spans = "\n".join(
+ f'{label} '
+ for idx, label in enumerate(labels)
+ )
+ svg = f"""
+
+
+
+
+
+
+
+
+ {''.join(ring_polygons)}
+ {axis_lines}
+
+ {label_spans}
+
+ """
+ return svg
+
+ # Level metadata for the 7-stage task framework
+ level_details = {
+ "ALL": {
+ "title": "ALL · All Tasks",
+ "description": "Compare overall performance levels and stage-specific strengths of models through average SR across L1~L7 levels."
+ },
+    "L1": {
+        "title": "L1 · Single Tool Execution",
+        "description": "Evaluates single tool execution capability and basic command performance accuracy."
+    },
+    "L2": {
+        "title": "L2 · Tool Selection Capability",
+        "description": "Measures the ability to select appropriate tools and invoke them with proper parameters."
+    },
+    "L3": {
+        "title": "L3 · Sequential Reasoning",
+        "description": "Verifies problem solving through multi-step sequential reasoning."
+    },
+    "L4": {
+        "title": "L4 · Parallel Reasoning",
+        "description": "Evaluates the ability to integrate and summarize information from multiple sources in parallel."
+    },
+    "L5": {
+        "title": "L5 · Robustness (Fallback Handling)",
+        "description": "Confirms recognition and response strategies for unexpected errors or failure situations."
+    },
+    "L6": {
+        "title": "L6 · Efficiency",
+        "description": "Examines operational efficiency in achieving goals with minimal calls and costs."
+    },
+    "L7": {
+        "title": "L7 · Long-term Context Memory",
+        "description": "Intensively analyzes the ability to maintain and appropriately utilize long-term conversation context."
+    }
+ }
+ default_level = "ALL"
+
+ sr_column_map = {level: f"{level}_SR" for level in level_ids}
+ overall_sort_column = "Overall Success"
+
+ def resolve_level(level_value):
+ """Normalize the incoming level filter value"""
+ if not level_value:
+ return default_level
+ return level_value if level_value in level_details else default_level
+
+ def generate_html_table(filtered_df, highlight_column):
+ """Generate styled HTML table with per-level success rates"""
+ valid_highlights = list(sr_column_map.values()) + ["Overall Success"]
+ highlight_column = highlight_column if highlight_column in valid_highlights else None
+ overall_column = "Overall Success"
+ overall_highlight = (highlight_column == overall_column)
+ highlight_map = {level: (sr_column_map[level] == highlight_column) for level in level_ids}
+
+ table_html = """
+
+
+
+
+
+
+ Rank
+ Model
+ Vendor
+ LLM Type
+ """
+ overall_header_classes = ["numeric-cell"]
+ if overall_highlight:
+ overall_header_classes.append("highlight-header")
+ table_html += f"""
+
+ """
+ for level in level_ids:
+ header_classes = ["numeric-cell"]
+ if highlight_map.get(level):
+ header_classes.append("highlight-header")
+ table_html += f"""
+
+ """
+ table_html += """
+
+
+
+ """
+ def safe_float(value):
+ if value is None:
+ return ''
+ if isinstance(value, str) and value.strip() == '':
+ return ''
+ if pd.isna(value):
+ return ''
+ try:
+ return float(value)
+ except (TypeError, ValueError):
+ return ''
+
+ # Generate table rows
+ for idx, (_, row) in enumerate(filtered_df.iterrows()):
+ rank = idx + 1
+ table_html += f"""
+
+ {get_rank_badge(rank)}
+ {row['Model']}
+ {row['Vendor']}
+ {get_type_badge(row['Model Type'])}
+ """
+ overall_value = safe_float(row.get(overall_column, ''))
+ if overall_value != '':
+ overall_display = f'{overall_value:.3f}'
+ else:
+ overall_display = '-'
+ overall_classes = ["numeric-cell"]
+ if overall_highlight:
+ overall_classes.append("highlight-cell")
+ table_html += f'{overall_display} '
+ for level in level_ids:
+ sr_col = sr_column_map[level]
+ value = safe_float(row.get(sr_col, ''))
+ if value != '':
+ value_display = f'{value:.3f}'
+ else:
+ value_display = '-'
+ cell_classes = ["numeric-cell"]
+ if highlight_map.get(level):
+ cell_classes.append("highlight-cell")
+ table_html += f'{value_display} '
+ table_html += " "
+
+ table_html += """
+
+
+
+ """
+
+ return table_html
+
+ def update_leaderboard_title(level_filter):
+ """Update the leaderboard title based on selected level"""
+ level_key = resolve_level(level_filter)
+ level_info = level_details.get(level_key, level_details[default_level])
+ level_title = level_info["title"]
+ level_description = level_info["description"]
+
+ return f"""
+
+
+
+Agent Leaderboard · {level_title}
+
+{level_description}
+
+
+ """
+
+ model_type_lookup = {
+ "OSS": "Open source",
+ "API": "Proprietary"
+ }
+
+ def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"):
+ """Apply shared filters and sorting to the leaderboard dataframe."""
+ filtered_df = df.copy()
+ level_key = resolve_level(level_filter)
+ highlight_column = None
+
+ if model_type_filter != "All":
+ mapped_type = model_type_lookup.get(model_type_filter, model_type_filter)
+ filtered_df = filtered_df[filtered_df['Model Type'] == mapped_type]
+
+ actual_sort_column = sort_by if sort_by in filtered_df.columns else None
+ if not actual_sort_column:
+ if level_key == "ALL":
+ actual_sort_column = overall_sort_column if overall_sort_column in filtered_df.columns else None
+ else:
+ actual_sort_column = sr_column_map.get(level_key)
+
+ if level_key in sr_column_map:
+ highlight_column = sr_column_map[level_key]
+ elif level_key == "ALL" and overall_sort_column in filtered_df.columns:
+ highlight_column = overall_sort_column
+
+ if actual_sort_column and actual_sort_column in filtered_df.columns:
+ ascending = (sort_order == "Ascending")
+ filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last')
+
+ return filtered_df, level_key, highlight_column
+
+ def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order):
+ """Filter and sort the leaderboard data"""
+ df = load_leaderboard_data()
+
+ filtered_df, level_key, highlight_column = apply_filters(df, level_filter, model_type_filter, sort_order, sort_by)
+
+ # Generate HTML table
+ return generate_html_table(filtered_df, highlight_column)
+
+ # Load initial data
+ initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending")
+ initial_df = load_leaderboard_data() # Load raw data for model selector
+ initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else []
+ initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else []
+ initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models)
+ initial_level_metric_level = level_ids[0] if level_ids else None
+ initial_level_model_choices = initial_df['Model'].tolist() if len(initial_df) > 0 else []
+ initial_level_model_values = initial_level_model_choices[:5]
+ initial_level_metric_chart = create_level_metric_chart(
+ initial_df,
+ initial_level_metric_level,
+ initial_level_model_values
+ ) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available")
+
+ # Load custom CSS and responsive styles
+ custom_css = get_leaderboard_css() + get_responsive_styles() + """
+
+
+
+ """
+
+ gr.HTML(custom_css)
+
+ # Header styles and navigation
+ gr.HTML("""
+
+ """)
+
+ gr.Image(
+ value="banner.png",
+ show_label=False,
+ interactive=False,
+ type="filepath",
+ elem_id="hero-banner"
+ )
+
+ gr.HTML("""
+
+
+Hugging Face KREW Ko-AgentBench
+
+Agent Benchmark Specialized for Korean Service Environment
+
+ """)
+
+ # Links section below title
+ gr.HTML("""
+
+ """)
+
+    # Section 1: Level-based task design
+ gr.HTML("""
+
+
+
+From simple tool calls to long-term context understanding and robustness handling,
+
+we analyzed agent capabilities in 3D across 7 levels.
+
+
+
+Single-Turn
+
+ 80%
+
+
+ L1: Single Tool Execution
+ L2: Tool Selection Capability
+ L3: Sequential Reasoning
+ L4: Parallel Reasoning
+ L5: Robustness
+
+
+
+
+Multi-Turn
+
+ 20%
+
+
+ L6: Efficiency
+ L7: Long-term Context Memory
+
+
+
+
+ """)
+
+    # Section 2: Core scenario composition
+ gr.HTML("""
+
+
+
+
+Realistic, user-centered scenarios, such as “appointment booking” and “blog review search”, were designed
+
+by integrating major domestic service APIs including Naver Maps and Kakao.
+
+
+⌄
+
+ """)
+
+    # Section 3: Core evaluation criteria
+ gr.HTML("""
+
+
+
+
+
+Cache-based Iterative Evaluation
+
+ Real API Response Caching
+ Solves chronic issues of existing benchmarks such as 'external API instability and information attribute mismatch'
+ Ensures benchmark consistency and reliability
+
+
+
+
+Robustness Test
+
+ Evaluates error recognition and response capability (strategy) for intentional error situations (product discontinuation)
+ Selects models that operate stably in real-world environments
+
+
+
+
+Level-specific Evaluation Metrics
+
+ Evaluates problem-solving efficiency at each stage including tool selection, parameter configuration, and data processing flow
+ Quantitatively identifies model strengths and weaknesses
+
+
+
+
+ """)
+
+ # Metrics overview cards removed per updated design
+
+ # Domain filter section with enhanced styling
+ gr.HTML("""
+
+
+ """)
+
+ level_options = list(level_details.keys())
+
+ with gr.Column(elem_classes=["domain-selector-container"], elem_id="task-level-selector"):
+ gr.HTML("""
+
+
+🧠 Select Task Level
+
+Easily compare agent performance across ALL · L1~L7 stages of Ko-AgentBench.
+
+ """)
+ domain_filter = gr.Radio(
+ choices=level_options,
+ value=default_level,
+ label="",
+ interactive=True,
+ container=False,
+ elem_classes=["domain-radio"]
+ )
+
+ # Filter controls with domain styling
+ with gr.Column(elem_classes=["domain-selector-container", "filters-sorting-container"], elem_id="filters-sorting-container"):
+ gr.HTML("""
+
+
+🔍 Filters & Sorting
+
+Select model type and sorting criteria to explore results in your preferred way.
+
+ """)
+ with gr.Row(elem_classes=["filters-sorting-row"]):
+ with gr.Column(scale=1, elem_classes=["filter-group"]):
+ with gr.Row(elem_classes=["filter-group-row"]):
+                    gr.HTML("Model Access")
+ model_type_filter = gr.Radio(
+ choices=["All", "OSS", "API"],
+ value="All",
+ label="",
+ elem_classes=["domain-radio"],
+ container=False
+ )
+ with gr.Column(scale=1, elem_classes=["filter-group"]):
+ with gr.Row(elem_classes=["filter-group-row"]):
+                    gr.HTML("Sort Order")
+ sort_order = gr.Radio(
+ choices=["Descending", "Ascending"],
+ value="Descending",
+ label="",
+ elem_classes=["domain-radio"],
+ container=False
+ )
+
+ # Main leaderboard table with dynamic title
+ leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
+
+ leaderboard_table = gr.HTML(initial_table)
+
+ gr.HTML("""
+
+
""")
+
+ # Radar Chart Section
+ gr.HTML("""
+
+
+
+Core Capability Radar
+
+#Execution Accuracy #Complex Reasoning #Robustness #Context & Efficiency #Overall Success #Validity
+
+Analyze model performance capabilities and balance through 6 core competencies.
+
+ """)
+
+ with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="radar-model-selector"):
+ gr.HTML("""
+
+
+🎯 Select Models for Comparison
+
+Select models to compare in the radar chart.
+
+ """)
+ model_selector = gr.Dropdown(
+ choices=initial_df['Model'].tolist()[:10],
+ value=initial_df['Model'].tolist()[:5],
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown"]
+ )
+
+ # Radar chart plot - wrapped in centered container
+    gr.HTML('')
+ radar_chart = gr.Plot(
+ label="",
+ value=create_domain_radar_chart(
+ load_leaderboard_data(),
+ initial_df['Model'].tolist()[:5]
+ ),
+ elem_classes=["radar-chart", "plot-container"]
+ )
+    gr.HTML('')
+
+    gr.HTML("")
+
+ # Level metric breakdown section
+ gr.HTML("""
+
+
+
+Level-Specific Metric Spotlight
+
+Compare model scores based on unique evaluation metrics for each L1–L7 level.
+
+ """)
+
+ with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
+ gr.HTML("""
+
+
+🧭 Select Task Level and Models
+
+Select L1–L7 levels and models to explore detailed SR-based metrics.
+
+ """)
+ level_metric_selector = gr.Dropdown(
+ choices=level_ids,
+ value=level_ids[0] if level_ids else None,
+ multiselect=False,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["level-dropdown"]
+ )
+ level_model_selector = gr.Dropdown(
+ choices=initial_level_model_choices,
+ value=initial_level_model_values,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown", "level-model-dropdown"]
+ )
+
+    gr.HTML('')
+ level_metric_chart = gr.Plot(
+ label="",
+ value=initial_level_metric_chart,
+ elem_classes=["level-metric-plot", "plot-container"]
+ )
+ gr.HTML("""
+
+
+ """)
+
+ # Heatmap section
+ gr.HTML("""
+
+
+
+Comprehensive Performance Heatmap
+
+Explore the comprehensive performance heatmap to see SR scores across L1–L7 levels for each model at a glance.
+
+
+ """)
+ heatmap_chart = gr.Plot(
+ label="",
+ value=initial_heatmap,
+ elem_classes=["heatmap-plot", "plot-container"]
+ )
+ gr.HTML("""
+
+
+ """)
+
+ # Update functions
+ def get_optimal_sort_order(sort_by_value):
+ """Return the optimal sort order for a given metric"""
+ # Metrics where higher is better (descending)
+ descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
+
+ # Metrics where lower is better (ascending)
+ ascending_metrics = []
+
+ if sort_by_value in descending_metrics:
+ return "Descending"
+ elif sort_by_value in ascending_metrics:
+ return "Ascending"
+ else:
+ return "Descending" # Default fallback
+
+ def update_table(level_filter, model_type_filter, sort_order):
+ title_html = update_leaderboard_title(level_filter)
+ sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
+ table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
+ return title_html, table_html
+
+ def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+ # Get filtered dataframe
+ df = load_leaderboard_data()
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+
+ # Update model selector choices based on filtered data
+ available_models_all = filtered_df['Model'].tolist()
+ available_models = available_models_all[:15] # Top 15 from filtered results
+
+ # If selected models are not in available models, reset to top 5
+ if selected_models:
+ valid_selected = [m for m in selected_models if m in available_models]
+ if not valid_selected:
+ valid_selected = available_models[:5]
+ else:
+ valid_selected = available_models[:5]
+
+ # Create radar chart
+ chart = create_domain_radar_chart(filtered_df, valid_selected)
+
+ # Prepare heatmap order prioritizing selected models
+ heatmap_order = []
+ for model in valid_selected:
+ if model not in heatmap_order:
+ heatmap_order.append(model)
+ for model in available_models_all:
+ if model not in heatmap_order:
+ heatmap_order.append(model)
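+        # Cap the heatmap at 12 models to keep cells readable (matches the
+        # max_models default of create_performance_heatmap)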
+ heatmap_order = heatmap_order[:12]
+ heatmap_fig = create_performance_heatmap(filtered_df, heatmap_order)
+
+ # Level metric chart
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+ available_level_models = available_models_all
+ if level_selected_models:
+ valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
+ if not valid_level_models:
+ valid_level_models = available_level_models[:5]
+ else:
+ valid_level_models = available_level_models[:5]
+ level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+
+ return (
+ gr.Dropdown(
+ choices=available_models,
+ value=valid_selected,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown"]
+ ),
+ chart,
+ heatmap_fig,
+ gr.Dropdown(
+ choices=available_level_models,
+ value=valid_level_models,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown", "level-model-dropdown"]
+ ),
+ level_metric_fig,
+ )
+
+ def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+ # Get filtered dataframe
+ df = load_leaderboard_data()
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+
+ available_models_all = filtered_df['Model'].tolist()
+ if selected_models:
+ valid_selected = [m for m in selected_models if m in available_models_all]
+ if not valid_selected:
+ valid_selected = available_models_all[:5]
+ else:
+ valid_selected = available_models_all[:5]
+
+ heatmap_order = []
+ for model in valid_selected:
+ if model not in heatmap_order:
+ heatmap_order.append(model)
+ for model in available_models_all:
+ if model not in heatmap_order:
+ heatmap_order.append(model)
+ heatmap_order = heatmap_order[:12]
+
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+ available_level_models = available_models_all
+ if level_selected_models:
+ valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
+ if not valid_level_models:
+ valid_level_models = available_level_models[:5]
+ else:
+ valid_level_models = available_level_models[:5]
+ level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+
+ return (
+ create_domain_radar_chart(filtered_df, valid_selected),
+ create_performance_heatmap(filtered_df, heatmap_order),
+ gr.Dropdown(
+ choices=available_level_models,
+ value=valid_level_models,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown", "level-model-dropdown"]
+ ),
+ level_metric_fig,
+ )
+
+ def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+ df = load_leaderboard_data()
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+ available_models = filtered_df['Model'].tolist()
+ if level_selected_models:
+ valid_level_models = [m for m in level_selected_models if m in available_models][:5]
+ if not valid_level_models:
+ valid_level_models = available_models[:5]
+ else:
+ valid_level_models = available_models[:5]
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+ level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+ return (
+ gr.Dropdown(
+ choices=available_models,
+ value=valid_level_models,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown", "level-model-dropdown"]
+ ),
+ level_chart,
+ )
+
+ # Update table when filters change
+ filter_inputs = [domain_filter, model_type_filter, sort_order]
+
+ for input_component in filter_inputs:
+ input_component.change(
+ fn=update_table,
+ inputs=filter_inputs,
+ outputs=[leaderboard_title, leaderboard_table]
+ )
+
+ # Also update radar chart when filters change
+ input_component.change(
+ fn=update_radar_chart,
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+ outputs=[model_selector, radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
+ )
+
+ # Update radar chart when model selection changes
+ model_selector.change(
+ fn=update_radar_only,
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+ outputs=[radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
+ )
+
+ level_metric_selector.change(
+ fn=update_level_metric_only,
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+ outputs=[level_model_selector, level_metric_chart]
+ )
+
+ level_model_selector.change(
+ fn=update_level_metric_only,
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+ outputs=[level_model_selector, level_metric_chart]
+ )
+
+ # Define generate_performance_card function before using it
+ def generate_performance_card(model_name):
+ """Generate HTML for the model performance card"""
+ if not model_name:
+ return """
+ Please select a model to generate its performance card
+
"""
+
+ # Get model data
+ df = load_leaderboard_data()
+ model_data = df[df['Model'] == model_name]
+
+ if model_data.empty:
+ return """
+ Model not found in the database
+
"""
+
+ row = model_data.iloc[0]
+
+ # Get overall rank based on overall success
+ df_with_success = df.copy()
+    df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series(dtype=float)), errors='coerce')
+ df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
+ df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
+ try:
+ rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
+    except IndexError:
+ rank = 'N/A'
+
+ # Format values
+ def format_value(val, decimals=3, prefix='', suffix=''):
+ if pd.isna(val) or val == '':
+ return 'N/A'
+ return f"{prefix}{float(val):.{decimals}f}{suffix}"
+
+ def format_score(value):
+ if pd.isna(value) or value == '':
+ return 'N/A'
+ return f"{float(value):.3f}"
+
+    # Use the same order as the domain radar but keep Robustness last
+ radar_metrics = [
+ ("Execution Accuracy", row.get('Execution Accuracy')),
+ ("Context & Efficiency", row.get('Context & Efficiency')),
+ ("Overall Success", row.get('Overall Success')),
+ ("Robustness", row.get('Robustness')),
+ ("Complex Reasoning", row.get('Complex Reasoning')),
+ ("Validity", row.get('Call Validity')),
+ ]
+ radar_values = []
+ radar_labels = []
+ for label, value in radar_metrics:
+ if pd.isna(value) or value == '':
+ radar_values.append(0.0)
+ else:
+ try:
+ radar_values.append(max(0.0, min(1.0, float(value))))
+ except (TypeError, ValueError):
+ radar_values.append(0.0)
+ radar_labels.append(label)
+
+ mini_radar_html = build_static_radar_chart(radar_values, radar_labels)
+
+ level_blocks = []
+ for level in level_ids:
+ sr_col = sr_column_map.get(level)
+ level_blocks.append((level, row.get(sr_col, '')))
+
+ evaluation_date = EVALUATION_DATE
+ icon_html = ""
+ if KREW_ICON_BASE64:
+ icon_html = f' '
+ else:
+        icon_html = '🤖'
+
+ card_html = f"""
+
+ """
+
+ return card_html
+
+ # MODEL PERFORMANCE CARD SECTION
+ gr.HTML("""
+
+
+
+Model Performance Card
+
+ Check out the precision analysis card that visualizes the model's performance spectrum with 6 key metrics and overall success rate (SR) by L1~L7 levels.
+
+
+ ※ Rank is calculated based on the average SR value across L1–L7 levels.
+
+
+
+ """)
+
+ with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
+ gr.HTML("""
+
+
+🤖 Select Model
+
+Select a model for the analysis card.
+
+ """)
+ card_model_selector = gr.Dropdown(
+ choices=initial_df['Model'].tolist(),
+ value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown"]
+ )
+ download_card_btn = gr.Button(
+ "Download as PNG",
+ elem_id="download-card-btn",
+ elem_classes=["pill-button"]
+ )
+
+ gr.HTML("""
+
+ """)
+
+ # Card display area - generate initial card
+ initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None
+ initial_card_html = generate_performance_card(initial_model) if initial_model else ""
+ card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html")
+
+ gr.HTML("""
+
+
+
+ """)
+
+ # Add custom CSS for the performance card
+ gr.HTML("""
+
+
+ """)
+
+
+ # Wire up the card generator to selection change
+ card_model_selector.change(
+ fn=generate_performance_card,
+ inputs=[card_model_selector],
+ outputs=[card_display]
+ )
+
+ # Wire up download button with html2canvas capture
+ download_card_btn.click(
+ fn=None,
+ js="""
+ async () => {
+ const ensureHtml2Canvas = () => new Promise((resolve, reject) => {
+ if (window.html2canvas) {
+ resolve(window.html2canvas);
+ return;
+ }
+ const existing = document.querySelector('script[data-html2canvas]');
+ if (existing) {
+ existing.addEventListener('load', () => resolve(window.html2canvas));
+ existing.addEventListener('error', reject);
+ return;
+ }
+ const script = document.createElement('script');
+ script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js';
+ script.async = true;
+ script.dataset.html2canvas = 'true';
+ script.onload = () => resolve(window.html2canvas);
+ script.onerror = () => reject(new Error('Failed to load html2canvas'));
+ document.head.appendChild(script);
+ });
+
+ const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
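+          // Brief pause so Gradio finishes re-rendering the card before capture.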
+ await pause(60);
+
+ const card = document.querySelector('.performance-card');
+ if (!card) {
+ alert('Performance card not found. Please select a model first.');
+ return;
+ }
+
+ const btn = document.getElementById('download-card-btn');
+ const originalText = btn?.textContent || '';
+ if (btn) {
+ btn.textContent = 'Generating...';
+ btn.disabled = true;
+ }
+
+ try {
+ const html2canvasLib = await ensureHtml2Canvas();
+ if (!html2canvasLib) {
+ throw new Error('html2canvas unavailable');
+ }
+
+ const canvas = await html2canvasLib(card, {
+ backgroundColor: '#01091A',
+ scale: 2,
+ logging: false,
+ useCORS: true
+ });
+
+ const link = document.createElement('a');
+ const modelName = card.querySelector('.card-model-name')?.textContent || 'model';
+ const timestamp = new Date().toISOString().slice(0, 10);
+ const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`;
+ link.download = fileName;
+ link.href = canvas.toDataURL('image/png');
+ document.body.appendChild(link);
+ link.click();
+ document.body.removeChild(link);
+ } catch (error) {
+ console.error('Error capturing card:', error);
+ alert('Failed to capture performance card. Please try again.');
+ } finally {
+ if (btn) {
+ btn.textContent = originalText;
+ btn.disabled = false;
+ }
+ }
+ }
+ """
+ )
+
+ # Also update card when filters change to keep model selector in sync
+ for input_component in filter_inputs:
+ def update_dropdown_and_card(*args):
+ filtered_df, _, _ = apply_filters(
+ load_leaderboard_data(),
+ args[0],
+ args[1],
+ args[2],
+ "Overall Success" if args[0] == "ALL" else sr_column_map.get(resolve_level(args[0]), "Overall Success")
+ )
+ choices = filtered_df['Model'].tolist()
+ # Select first model from filtered list
+ value = choices[0] if choices else None
+ return gr.Dropdown(
+ choices=choices,
+ value=value,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown"]
+ )
+
+ input_component.change(
+ fn=update_dropdown_and_card,
+ inputs=filter_inputs,
+ outputs=[card_model_selector]
+ )
+
+ return leaderboard_table
+
+
+def create_leaderboard_v2_interface():
+ """Create the complete leaderboard v1 interface"""
+ return create_leaderboard_v2_tab()
+
+
+def create_domain_radar_chart(df, selected_models=None, max_models=5):
+ """Visualize six core capability metrics on a radar chart."""
+ df = df.copy()
+    # Use the same metric order as the model performance card,
+    # with Robustness placed last as requested
+ metrics_info = [
+ {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
+ {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
+        {"column": "Overall Success", "label": "Overall Success", "description": "Average Success Rate across L1~L7"},
+ {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
+ {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
+        {"column": "Call Validity", "label": "Validity", "description": "Average EPR_CVR across levels"},
+ ]
+
+ required_columns = [m["column"] for m in metrics_info]
+ if df.empty or not any(col in df.columns for col in required_columns):
+ return create_empty_radar_chart("Not enough data to build the capability radar")
+
+ # Default model selection
+ if not selected_models:
+ if "Overall Success" in df.columns:
+ top_models = df.sort_values("Overall Success", ascending=False)
+ else:
+ top_models = df
+ selected_models = top_models['Model'].head(max_models).tolist()
+
+ selected_models = selected_models[:max_models]
+
+ # Ensure metric columns are numeric
+ for metric in metrics_info:
+ col = metric["column"]
+ if col in df.columns:
+ df[col] = pd.to_numeric(df[col], errors='coerce')
+
+ fig = go.Figure()
+ angle_labels = [m["label"] for m in metrics_info]
+
+ palette = [
+ {'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'},
+ {'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'},
+ {'fill': 'rgba(249, 112, 185, 0.22)', 'line': '#F970B9'},
+ {'fill': 'rgba(139, 92, 246, 0.20)', 'line': '#8B5CF6'},
+ {'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'},
+ ]
+
+ for idx, model_name in enumerate(selected_models):
+ model_data = df[df['Model'] == model_name]
+ if model_data.empty:
+ continue
+
+ row = model_data.iloc[0]
+ values = []
+ tooltips = []
+ for metric in metrics_info:
+ col = metric["column"]
+ value = row[col] if col in row else float('nan')
+ if pd.isna(value) or value == '':
+ value = 0
+ values.append(float(value))
+ tooltips.append(metric["description"])
+
+ if not values:
+ continue
+
+ values_loop = values + [values[0]]
+ angles_loop = angle_labels + [angle_labels[0]]
+ tooltips_loop = tooltips + [tooltips[0]]
+ colors = palette[idx % len(palette)]
+
+ fig.add_trace(
+ go.Scatterpolar(
+ r=values_loop,
+ theta=angles_loop,
+ fill='toself',
+ fillcolor=colors['fill'],
+ line=dict(color=colors['line'], width=3),
+ marker=dict(
+ size=10,
+ color=colors['line'],
+ symbol='circle',
+ line=dict(width=2, color='#01091A')
+ ),
+ name=model_name,
+ customdata=tooltips_loop,
+ mode="lines+markers",
+ hovertemplate="%{fullData.name} " +
+ "%{theta} " +
+ "%{customdata} " +
+ "%{r:.3f} " +
+ " ",
+ hoverlabel=dict(
+ bgcolor="rgba(1, 9, 26, 0.95)",
+ bordercolor=colors['line'],
+ font=dict(color="white", size=12, family="'Geist', sans-serif")
+ )
+ )
+ )
+
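+    # Radial axis ticks at 0.0, 0.2, ..., 1.0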
+ tick_vals = [i / 5 for i in range(6)]
+ tick_text = [f"{val:.2f}" for val in tick_vals]
+
+ fig.update_layout(
+ polar=dict(
+ bgcolor='rgba(245, 246, 247, 0.03)',
+ radialaxis=dict(
+ visible=True,
+ range=[0, 1],
+ showline=True,
+ linewidth=2,
+ linecolor='rgba(245, 246, 247, 0.2)',
+ gridcolor='rgba(245, 246, 247, 0.1)',
+ gridwidth=1,
+ tickvals=tick_vals,
+ ticktext=tick_text,
+ tickfont=dict(
+ size=11,
+ color='white',
+ family="'Geist Mono', monospace"
+ )
+ ),
+ angularaxis=dict(
+ showline=True,
+ linewidth=2,
+ linecolor='rgba(245, 246, 247, 0.2)',
+ gridcolor='rgba(245, 246, 247, 0.08)',
+ tickfont=dict(
+ size=13,
+ family="'Geist', sans-serif",
+ color='white',
+ weight=600
+ ),
+ rotation=90,
+ direction="clockwise",
+ ),
+ ),
+ showlegend=True,
+ legend=dict(
+ orientation="h",
+ yanchor="bottom",
+ y=-0.15,
+ xanchor="center",
+ x=0.5,
+ font=dict(size=12, family="'Geist', sans-serif", color='white'),
+ bgcolor='rgba(1, 9, 26, 0.8)',
+ bordercolor='rgba(245, 246, 247, 0.2)',
+ borderwidth=1,
+ itemsizing='constant',
+ itemwidth=30
+ ),
+ title=dict(
+ text="Core Capability Radar ",
+ x=0.5,
+ y=0.97,
+ font=dict(
+ size=22,
+ family="'Geist', sans-serif",
+ color="white",
+ weight=700
+ ),
+ ),
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=800,
+ width=900,
+ margin=dict(t=30, b=50, l=10, r=10),
+ autosize=True,
+ annotations=[
+ dict(
+ text="Ko-Agent Leaderboard",
+ xref="paper", yref="paper",
+ x=0.98, y=0.02,
+ xanchor='right', yanchor='bottom',
+ font=dict(size=10, color='#64748B'),
+ showarrow=False
+ )
+ ]
+ )
+
+ return fig
+
+
+def create_performance_heatmap(df, ordered_models=None, max_models=12):
+ """Render a heatmap of SR scores across task levels for selected models."""
+ df = df.copy()
+ level_sequence = [f"L{i}" for i in range(1, 8)]
+ sr_columns = []
+ for level in level_sequence:
+ col = f"{level}_SR"
+ if col in df.columns:
+ df[col] = pd.to_numeric(df[col], errors="coerce")
+ sr_columns.append((level, col))
+ if df.empty or not sr_columns:
+ return create_empty_heatmap("Not enough SR data to render the heatmap")
+
+ df = df.drop_duplicates(subset=["Model"])
+ if df.empty:
+ return create_empty_heatmap("No models available to render the heatmap")
+
+    sort_column = "Overall Success" if "Overall Success" in df.columns else sr_columns[0][1]  # sr_columns holds (level, column) tuples
+ df = df.sort_values(sort_column, ascending=False)
+
+ if ordered_models:
+ ordered_models = [m for m in ordered_models if m in df["Model"].tolist()]
+ else:
+ ordered_models = df["Model"].tolist()
+
+ if not ordered_models:
+ return create_empty_heatmap("No models available to render the heatmap")
+
+ ordered_models = ordered_models[:max_models]
+ heatmap_df = df.set_index("Model").reindex(ordered_models)
+
+ level_labels = []
+ z_matrix = []
+ has_values = False
+
+ for level, col in sr_columns:
+ if col not in heatmap_df.columns:
+ continue
+ label = f"{level} · SR"
+ level_labels.append(label)
+ row_values = []
+ for model in ordered_models:
+ value = heatmap_df.at[model, col] if model in heatmap_df.index else None
+ if pd.isna(value):
+ row_values.append(None)
+ else:
+ val = float(value)
+ row_values.append(val)
+ has_values = True
+ z_matrix.append(row_values)
+
+ if not level_labels or not has_values:
+ return create_empty_heatmap("Not enough SR data to render the heatmap")
+
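+    # Dark-to-gold ramp: low SR cells fade toward the page background,
+    # high SR cells read as bright highlights.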
+ colorscale = [
+ [0.0, "#0A0A0A"],
+ [0.25, "#1A1411"],
+ [0.5, "#332818"],
+ [0.75, "#B8660A"],
+ [1.0, "#FFD21E"],
+ ]
+
+ fig = go.Figure()
+ fig.add_trace(
+ go.Heatmap(
+ z=z_matrix,
+ x=ordered_models,
+ y=level_labels,
+ colorscale=colorscale,
+ zmin=0,
+ zmax=1,
+ hovertemplate="%{y} %{x} SR · %{z:.3f} ",
+ colorbar=dict(
+                title=dict(
+                    text="Success Rate",
+                    font=dict(color="white", family="'Geist', sans-serif", size=12)
+                ),
+ tickfont=dict(color="white", family="'Geist', sans-serif", size=10),
+ thickness=12,
+ len=0.7,
+ outlinecolor="rgba(255, 255, 255, 0.1)",
+ bgcolor="rgba(1, 9, 26, 0.75)"
+ ),
+ showscale=True
+ )
+ )
+
+ annotations = []
+ for y_idx, level in enumerate(level_labels):
+ for x_idx, model in enumerate(ordered_models):
+ value = z_matrix[y_idx][x_idx]
+ if value is None:
+ continue
+ font_color = "#0B1120" if value >= 0.6 else "#F8FAFC"
+ annotations.append(
+ dict(
+ x=model,
+ y=level,
+ text=f"{value:.3f}",
+ showarrow=False,
+ font=dict(
+ family="'Geist Mono', monospace",
+ size=11,
+ color=font_color
+ )
+ )
+ )
+
+ fig.update_layout(
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ margin=dict(t=80, b=90, l=110, r=160),
+ height=520,
+ width=1450,
+ font=dict(family="'Geist', sans-serif", color="white"),
+ xaxis=dict(
+ tickangle=-25,
+ showgrid=False,
+ ticks="",
+ tickfont=dict(size=11, family="'Geist', sans-serif", color="white")
+ ),
+ yaxis=dict(
+ showgrid=False,
+ ticks="",
+ tickfont=dict(size=12, family="'Geist', sans-serif", color="white")
+ ),
+ annotations=annotations,
+ title=dict(
+ text="Comprehensive Performance Heatmap ",
+ x=0.5,
+ y=0.98,
+ font=dict(
+ size=20,
+ family="'Geist', sans-serif",
+ color="white",
+ weight=700
+ ),
+ )
+ )
+ fig.update_xaxes(side="bottom")
+
+ return fig
+
+
+def create_empty_heatmap(message):
+ """Render an empty state for the heatmap with a centered message."""
+ fig = go.Figure()
+ fig.add_annotation(
+ text=f"🗺️ {message}",
+ xref="paper", yref="paper",
+ x=0.5, y=0.5,
+ xanchor='center', yanchor='middle',
+ font=dict(
+ size=18,
+ color="white",
+ family="'Geist', sans-serif"
+ ),
+ showarrow=False,
+ bgcolor="rgba(245, 246, 247, 0.05)",
+ bordercolor="rgba(245, 246, 247, 0.2)",
+ borderwidth=1,
+ borderpad=20
+ )
+
+ fig.update_layout(
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=520,
+ width=1450,
+ margin=dict(t=80, b=80, l=80, r=160),
+ title=dict(
+ text="Comprehensive Performance Heatmap ",
+ x=0.5,
+ y=0.98,
+ font=dict(
+ size=20,
+ family="'Geist', sans-serif",
+ color="white",
+ weight=700
+ ),
+ )
+ )
+ fig.update_xaxes(visible=False)
+ fig.update_yaxes(visible=False)
+ return fig
+
+
+def create_level_metric_chart(df, level, selected_models=None, max_models=5):
+ """Render a grouped horizontal bar chart showing per-model scores for a level's metrics."""
+ if not level:
+ return create_empty_level_metric_chart("Select a level to view its metrics")
+ df = df.copy()
+ level_prefix = f"{level}_"
+ level_columns = [col for col in df.columns if col.startswith(level_prefix)]
+ metric_columns = []
+ for col in level_columns:
+ metric_suffix = col[len(level_prefix):]
+ metric_key_lower = metric_suffix.lower()
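+        # Keep only rate-like metrics: skip cost columns and any column with
+        # values outside [0, 1] (the 1.05 bound allows rounding slack), since
+        # the bar axis assumes normalized scores.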
+ if "cost" in metric_key_lower:
+ continue
+ numeric_series = pd.to_numeric(df[col], errors='coerce')
+ valid_values = numeric_series.dropna()
+ if valid_values.empty:
+ continue
+ if (valid_values < 0).any() or (valid_values > 1.05).any():
+ continue
+ df[col] = numeric_series
+ metric_columns.append(col)
+ if not metric_columns:
+ return create_empty_level_metric_chart("This level has no 0-1 metrics to visualize")
+ df = df.drop_duplicates(subset=['Model'])
+ if df.empty:
+ return create_empty_level_metric_chart("No models available to render level metrics")
+ if selected_models:
+ model_order = [m for m in selected_models if m in df['Model'].tolist()]
+ else:
+ sort_col = 'Overall Success' if 'Overall Success' in df.columns else metric_columns[0]
+ model_order = df.sort_values(sort_col, ascending=False)['Model'].tolist()
+ if not model_order:
+ model_order = df['Model'].tolist()
+ model_order = model_order[:max_models]
+ df_models = df[df['Model'].isin(model_order)].set_index('Model')
+ if df_models.empty:
+ return create_empty_level_metric_chart("No matching models for selected filters")
+ def prettify_metric_name(metric_key):
+ raw = metric_key[len(level_prefix):]
+ text = raw.replace('_', ' ')
+        text = re.sub(r'(?<=[a-z0-9])([A-Z])', r' \1', text)
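+        # e.g. 'AdaptiveRoutingScore' -> 'Adaptive Routing Score'; runs of
+        # capitals like 'PSM' stay intact for the acronym fixups below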
+ text = text.replace('Avg', 'Average')
+ replacements = {
+ 'Sr': 'SR',
+ 'Ac': 'AC',
+ 'Tsq': 'TSQ',
+ 'Cvr': 'CVR',
+ 'Psm': 'PSM',
+ 'Prov': 'Prov',
+ 'Call Em': 'CallEM',
+ 'Reuse Rate': 'Reuse Rate',
+ 'Eff Score': 'Eff Score'
+ }
+        words = text.title().split()
+        words = [replacements.get(word, word) for word in words]
+        label = ' '.join(words)
+        # Multi-word keys like 'Call Em' can only match after the words are rejoined
+        for phrase, fixed in replacements.items():
+            if ' ' in phrase:
+                label = label.replace(phrase, fixed)
+        return label
+ metric_labels = []
+ for col in metric_columns:
+ label = prettify_metric_name(col)
+ if label in metric_labels:
+ suffix = 2
+ while f"{label} ({suffix})" in metric_labels:
+ suffix += 1
+ label = f"{label} ({suffix})"
+ metric_labels.append(label)
+ model_palette = [
+ '#ffd21e',
+ '#FF8A3C',
+ '#F970B9',
+ '#8B5CF6',
+ '#F8FAFC',
+ '#38BDF8',
+ ]
+ fig = go.Figure()
+ max_value = 0
+ for idx, model in enumerate(model_order):
+ values = []
+ for col in metric_columns:
+ value = df_models.at[model, col] if (model in df_models.index and col in df_models.columns) else float('nan')
+ if pd.notna(value):
+ values.append(float(value))
+ max_value = max(max_value, float(value))
+ else:
+ values.append(None)
+ color = model_palette[idx % len(model_palette)]
+ fig.add_trace(
+ go.Bar(
+ name=model,
+ y=metric_labels,
+ x=values,
+ orientation='h',
+ marker=dict(color=color, line=dict(color='rgba(1,9,26,0.8)', width=1)),
+ hovertemplate="%{y} Model · %{fullData.name} Score · %{x:.3f} ",
+ )
+ )
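+    # Grow the canvas with the metric count: ~140px of chrome plus 48px per
+    # metric row, with a 360px floor.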
+ plot_height = max(360, 140 + 48 * len(metric_labels))
+ if max_value <= 0:
+ x_range = [0, 1]
+ else:
+ x_range = [0, max_value * 1.05]
+ fig.update_layout(
+ barmode='group',
+ bargap=0.25,
+ bargroupgap=0.18,
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=plot_height,
+ width=1450,
+ margin=dict(t=90, b=80, l=220, r=160),
+ legend=dict(
+ orientation="h",
+ yanchor="bottom",
+ y=1.02,
+ xanchor="right",
+ x=1,
+ bgcolor='rgba(1, 9, 26, 0.75)',
+ bordercolor='rgba(245, 246, 247, 0.2)',
+ borderwidth=1,
+ font=dict(size=11, family="'Geist', sans-serif", color='white')
+ ),
+ xaxis=dict(
+ title=dict(text=f"{level} Metric Score ", font=dict(size=14, color="white")),
+ tickfont=dict(size=11, color="white"),
+ gridcolor='rgba(245, 246, 247, 0.08)',
+ zerolinecolor='rgba(245, 246, 247, 0.18)',
+ range=x_range
+ ),
+ yaxis=dict(
+ tickfont=dict(size=13, color="white"),
+ automargin=True
+ ),
+ title=dict(
+ text=f"{level} Metric Breakdown ",
+ x=0.5,
+ y=0.98,
+ font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
+ )
+ )
+ return fig
+
+
+def create_empty_level_metric_chart(message):
+ fig = go.Figure()
+ fig.add_annotation(
+ text=f"🧭 {message}",
+ xref="paper", yref="paper",
+ x=0.5, y=0.5,
+ xanchor='center', yanchor='middle',
+ font=dict(size=18, color="white", family="'Geist', sans-serif"),
+ showarrow=False,
+ bgcolor="rgba(245, 246, 247, 0.05)",
+ bordercolor="rgba(245, 246, 247, 0.2)",
+ borderwidth=1,
+ borderpad=20
+ )
+ fig.update_layout(
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=420,
+ width=1450,
+ margin=dict(t=80, b=60, l=80, r=120),
+ title=dict(
+ text="Level Metric Breakdown ",
+ x=0.5,
+ y=0.98,
+ font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
+ )
+ )
+ fig.update_xaxes(visible=False)
+ fig.update_yaxes(visible=False)
+ return fig
+
+
+def create_empty_radar_chart(message):
+ """Create an empty radar chart with a message"""
+ fig = go.Figure()
+
+ fig.add_annotation(
+ text=f"📊 {message}",
+ xref="paper", yref="paper",
+ x=0.5, y=0.5,
+ xanchor='center', yanchor='middle',
+ font=dict(
+ size=18,
+ color="white",
+ family="'Geist', sans-serif"
+ ),
+ showarrow=False,
+ bgcolor="rgba(245, 246, 247, 0.05)",
+ bordercolor="rgba(245, 246, 247, 0.2)",
+ borderwidth=1,
+ borderpad=20
+ )
+
+ fig.update_layout(
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=1450,
+ width=1450,
+ margin=dict(t=100, b=80, l=80, r=200),
+ title=dict(
+ text="Core Capability Radar ",
+ x=0.5,
+ y=0.97,
+ font=dict(
+ size=22,
+ family="'Geist', sans-serif",
+ color="white",
+ weight=700
+ ),
+ ),
+ annotations=[
+            dict(
+                text="Ko-Agent Leaderboard",
+ xref="paper", yref="paper",
+ x=0.98, y=0.02,
+ xanchor='right', yanchor='bottom',
+ font=dict(size=10, color='#64748B'),
+ showarrow=False
+ )
+ ]
+ )
+
+ return fig
+
+
+# NEW VISUALIZATION FUNCTIONS
+
+def create_cost_performance_scatter(df, metric="Avg AC"):
+ """Create scatter plot showing cost vs performance efficiency"""
+ # Filter out models without cost or performance data
+ df_filtered = df[(df['Avg Total Cost'] != '') & (df[metric] != '')].copy()
+ label_map = {
+ 'Proprietary': 'API',
+ 'Open source': 'OSS'
+ }
+
+ if df_filtered.empty:
+ return create_empty_chart("No data available for cost-performance analysis")
+
+ # Convert to numeric
+ df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce')
+ df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce')
+ df_filtered['Avg Turns'] = pd.to_numeric(df_filtered['Avg Turns'], errors='coerce')
+
+ # Create color mapping for model type
+ color_map = {
+ 'Proprietary': '#1098F7', # Airglow Blue for Proprietary
+ 'Open source': '#58BC82' # Green for Open source
+ }
+ df_filtered['Color'] = df_filtered['Model Type'].map(color_map).fillna('#F5F6F7')
+
+ fig = go.Figure()
+
+ # Add scatter points
+ for model_type in df_filtered['Model Type'].unique():
+ df_type = df_filtered[df_filtered['Model Type'] == model_type]
+ legend_name = label_map.get(model_type, model_type)
+
+ fig.add_trace(go.Scatter(
+ x=df_type[metric],
+ y=df_type['Avg Total Cost'],
+ mode='markers+text',
+ name=legend_name,
+ text=df_type['Model'],
+ textposition="top center",
+ textfont=dict(size=10, color='white'),
+ marker=dict(
+ size=df_type['Avg Turns'] * 3, # Size based on number of turns
+ color=color_map.get(model_type, '#F5F6F7'),
+ opacity=0.8,
+ line=dict(width=2, color='#01091A')
+            ),
+            customdata=df_type['Avg Turns'],
+            hovertemplate="%{text} " +
+                          f"{metric}: %{{x:.3f}} " +
+                          "Cost: $%{y:.3f} " +
+                          "Turns: %{customdata:.1f} " +
+ " "
+ ))
+
+ # Add quadrant lines
+ median_x = df_filtered[metric].median()
+ median_y = df_filtered['Avg Total Cost'].median()
+
+ fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
+ fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)
+
+ # Add quadrant labels
+ fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance Low Cost",
+ showarrow=False, xref="paper", yref="paper",
+ font=dict(size=12, color="white"), bgcolor="rgba(245, 246, 247, 0.1)")
+ fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance High Cost",
+ showarrow=False, xref="paper", yref="paper",
+ font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)")
+
+ metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"
+
+ fig.update_layout(
+ title=dict(
+ text=f"Cost-Performance Efficiency: {metric_display} ",
+ x=0.5,
+ y=0.97,
+ font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
+ ),
+ xaxis=dict(
+ title=dict(
+ text=f"{metric_display} ",
+ font=dict(size=16, color="white")
+ ),
+ tickfont=dict(size=12, color="white"),
+ gridcolor="rgba(245, 246, 247, 0.1)",
+ zerolinecolor="rgba(245, 246, 247, 0.2)"
+ ),
+ yaxis=dict(
+ title=dict(
+ text="Average Session Cost ($) ",
+ font=dict(size=16, color="white")
+ ),
+ tickfont=dict(size=12, color="white"),
+ gridcolor="rgba(245, 246, 247, 0.1)",
+ zerolinecolor="rgba(245, 246, 247, 0.2)"
+ ),
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=900,
+ width=1450,
+ showlegend=True,
+ legend=dict(
+ orientation="h",
+ yanchor="bottom",
+ y=1.02,
+ xanchor="right",
+ x=1,
+ font=dict(size=12, family="'Geist', sans-serif", color='white'),
+ bgcolor='rgba(1, 9, 26, 0.8)',
+ bordercolor='rgba(245, 246, 247, 0.2)',
+ borderwidth=1
+ ),
+ margin=dict(t=100, b=80, l=80, r=80)
+ )
+
+ return fig
+
+
+def create_speed_accuracy_plot(df, metric="Avg AC"):
+ """Create scatter plot showing speed vs accuracy trade-off"""
+ # Filter out models without duration or performance data
+ df_filtered = df[(df['Avg Session Duration'] != '') & (df[metric] != '')].copy()
+
+ if df_filtered.empty:
+ return create_empty_chart("No data available for speed-accuracy analysis")
+
+ # Convert to numeric
+ df_filtered['Avg Session Duration'] = pd.to_numeric(df_filtered['Avg Session Duration'], errors='coerce')
+ df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce')
+
+ # Create color scale based on cost
+ df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce')
+
+ fig = go.Figure()
+
+ # Add scatter trace
+ fig.add_trace(go.Scatter(
+ x=df_filtered[metric],
+ y=df_filtered['Avg Session Duration'],
+ mode='markers+text',
+ text=df_filtered['Model'],
+ textposition="top center",
+ textfont=dict(size=9, color='white'),
+ marker=dict(
+ size=12,
+ color=df_filtered['Avg Total Cost'],
+ colorscale=[[0, '#0A0A0A'], [0.5, '#B8660A'], [1, '#ffd21e']],
+ showscale=True,
+ colorbar=dict(
+ title=dict(
+ text="Cost ($)",
+ font=dict(color="white")
+ ),
+ tickfont=dict(color="white"),
+ bgcolor="rgba(1, 9, 26, 0.8)",
+ bordercolor="rgba(245, 246, 247, 0.2)",
+ borderwidth=1,
+ x=1.02
+ ),
+ line=dict(width=2, color='#01091A')
+ ),
+ hovertemplate="%{text} " +
+ f"{metric}: %{{x:.3f}} " +
+ "Duration: %{y:.1f}s " +
+ "Cost: $%{marker.color:.3f} " +
+ " "
+ ))
+
+ # Add quadrant lines
+ median_x = df_filtered[metric].median()
+ median_y = df_filtered['Avg Session Duration'].median()
+
+ fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
+ fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)
+
+ # Add quadrant labels
+ fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate",
+ showarrow=False, xref="paper", yref="paper",
+ font=dict(size=12, color="white", weight=600))
+ fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
+ showarrow=False, xref="paper", yref="paper",
+ font=dict(size=12, color="#ffd21e", weight=600))
+
+ metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"
+
+ fig.update_layout(
+ title=dict(
+ text=f"Speed vs Accuracy Trade-off: {metric_display} ",
+ x=0.5,
+ y=0.97,
+ font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
+ ),
+ xaxis=dict(
+ title=dict(
+ text=f"{metric_display} ",
+ font=dict(size=16, color="white")
+ ),
+ tickfont=dict(size=12, color="white"),
+ gridcolor="rgba(245, 246, 247, 0.1)",
+ zerolinecolor="rgba(245, 246, 247, 0.2)"
+ ),
+ yaxis=dict(
+ title=dict(
+ text="Average Session Duration (seconds) ",
+ font=dict(size=16, color="white")
+ ),
+ tickfont=dict(size=12, color="white"),
+ gridcolor="rgba(245, 246, 247, 0.1)",
+ zerolinecolor="rgba(245, 246, 247, 0.2)"
+ ),
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=900,
+ width=1450,
+ margin=dict(t=100, b=80, l=80, r=120)
+ )
+
+ return fig
+
+
+def create_domain_specialization_matrix(df, metric_type="AC"):
+ """Create bubble chart showing domain specialization"""
+ domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
+
+ # Prepare data
+ data = []
+ for _, model in df.iterrows():
+ if model['Model'] == '':
+ continue
+
+ model_avg = pd.to_numeric(model[f'Avg {metric_type}'], errors='coerce')
+ if pd.isna(model_avg):
+ continue
+
+ for domain in domains:
+ domain_col = f'{domain} {metric_type}'
+ if domain_col in model and model[domain_col] != '':
+ domain_val = pd.to_numeric(model[domain_col], errors='coerce')
+ if not pd.isna(domain_val):
+ # Calculate specialization strength (deviation from model average)
+ specialization = domain_val - model_avg
+ data.append({
+ 'Model': model['Model'],
+ 'Domain': domain,
+ 'Performance': domain_val,
+ 'Specialization': specialization,
+ 'Model Type': model['Model Type']
+ })
+
+ if not data:
+ return create_empty_chart("No domain specialization data available")
+
+ df_plot = pd.DataFrame(data)
+
+ # Create bubble chart
+ fig = go.Figure()
+
+ # Color based on specialization strength
+ fig.add_trace(go.Scatter(
+ x=df_plot['Domain'],
+ y=df_plot['Model'],
+ mode='markers',
+ marker=dict(
+ size=df_plot['Performance'] * 30, # Size based on absolute performance
+ color=df_plot['Specialization'],
+ colorscale=[[0, '#B8660A'], [0.5, '#E6B800'], [1, '#ffd21e']],
+ showscale=True,
+ colorbar=dict(
+ title=dict(
+ text="Specialization Strength",
+ font=dict(color="white")
+ ),
+ tickfont=dict(color="white"),
+ bgcolor="rgba(1, 9, 26, 0.8)",
+ bordercolor="rgba(245, 246, 247, 0.2)",
+ borderwidth=1
+ ),
+ line=dict(width=2, color='#01091A'),
+ opacity=0.8
+ ),
+ text=[f"Performance: {p:.3f} Specialization: {s:+.3f}"
+ for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
+ hovertemplate="%{y} " +
+ "Domain: %{x} " +
+ "%{text} " +
+ " "
+ ))
+
+ metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
+
+ fig.update_layout(
+ title=dict(
+ text=f"Domain Specialization Matrix: {metric_display} ",
+ x=0.5,
+ y=0.97,
+ font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
+ ),
+ xaxis=dict(
+ title=dict(
+ text="Business Domains ",
+ font=dict(size=16, color="white")
+ ),
+ tickfont=dict(size=13, color="white"),
+ gridcolor="rgba(245, 246, 247, 0.1)"
+ ),
+ yaxis=dict(
+ title=dict(
+ text="Models ",
+ font=dict(size=16, color="white")
+ ),
+ tickfont=dict(size=11, color="white"),
+ gridcolor="rgba(245, 246, 247, 0.1)"
+ ),
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=1100,
+ width=1450,
+ margin=dict(t=100, b=80, l=220, r=120)
+ )
+
+ return fig
+
+
+def create_performance_gap_analysis(df, metric_type="AC"):
+ """Create range plot showing performance gaps by domain"""
+ domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
+
+ # Calculate min, max, median for each domain
+ gap_data = []
+ for domain in domains:
+ domain_col = f'{domain} {metric_type}'
+ if domain_col in df.columns:
+ domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
+ if len(domain_values) > 0:
+ gap_data.append({
+ 'Domain': domain,
+ 'Min': domain_values.min(),
+ 'Max': domain_values.max(),
+ 'Median': domain_values.median(),
+ 'Q1': domain_values.quantile(0.25),
+ 'Q3': domain_values.quantile(0.75),
+ 'Gap': domain_values.max() - domain_values.min()
+ })
+
+ if not gap_data:
+ return create_empty_chart("No data available for gap analysis")
+
+ df_gap = pd.DataFrame(gap_data)
+ df_gap = df_gap.sort_values('Gap', ascending=True)
+
+ fig = go.Figure()
+
+ # Add range bars
+ for idx, row in df_gap.iterrows():
+ # Add full range line
+ fig.add_trace(go.Scatter(
+ x=[row['Min'], row['Max']],
+ y=[row['Domain'], row['Domain']],
+ mode='lines',
+ line=dict(color='#64748B', width=2),
+ showlegend=False,
+ hoverinfo='skip'
+ ))
+
+ # Add IQR box
+ fig.add_trace(go.Scatter(
+ x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']],
+ y=[row['Domain'], row['Domain'], row['Domain'], row['Domain'], row['Domain']],
+ fill='toself',
+ fillcolor='rgba(255, 210, 30, 0.3)',
+ line=dict(color='#ffd21e', width=2),
+ showlegend=False,
+ hoverinfo='skip',
+ mode='lines'
+ ))
+
+ # Add median marker
+ fig.add_trace(go.Scatter(
+ x=[row['Median']],
+ y=[row['Domain']],
+ mode='markers',
+ marker=dict(
+ size=12,
+ color='#ffd21e',
+ symbol='diamond',
+ line=dict(width=2, color='#01091A')
+ ),
+ showlegend=False,
+ hovertemplate=f"{row['Domain']} " +
+ f"Min: {row['Min']:.3f} " +
+ f"Q1: {row['Q1']:.3f} " +
+ f"Median: {row['Median']:.3f} " +
+ f"Q3: {row['Q3']:.3f} " +
+ f"Max: {row['Max']:.3f} " +
+ f"Gap: {row['Gap']:.3f} " +
+ " "
+ ))
+
+ # Add min/max points
+ for idx, row in df_gap.iterrows():
+ fig.add_trace(go.Scatter(
+ x=[row['Min'], row['Max']],
+ y=[row['Domain'], row['Domain']],
+ mode='markers',
+ marker=dict(size=8, color='white', line=dict(width=2, color='#01091A')),
+ showlegend=False,
+ hoverinfo='skip'
+ ))
+
+ metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
+
+ fig.update_layout(
+ title=dict(
+ text=f"Performance Gap Analysis by Domain: {metric_display} ",
+ x=0.5,
+ y=0.97,
+ font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
+ ),
+ xaxis=dict(
+ title=dict(
+ text=f"{metric_display} Score ",
+ font=dict(size=16, color="white")
+ ),
+ tickfont=dict(size=12, color="white"),
+ gridcolor="rgba(245, 246, 247, 0.1)",
+ range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
+ ),
+ yaxis=dict(
+ title=dict(
+ text="Business Domain ",
+ font=dict(size=16, color="white")
+ ),
+ tickfont=dict(size=13, color="white"),
+ gridcolor="rgba(245, 246, 247, 0.1)"
+ ),
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=800,
+ width=1450,
+ margin=dict(t=100, b=80, l=140, r=80),
+ showlegend=False
+ )
+
+ # Add legend manually
+ fig.add_annotation(
+ text="◆ Median ━ IQR ─ Full Range",
+ xref="paper", yref="paper",
+ x=0.98, y=0.02,
+ xanchor='right', yanchor='bottom',
+ font=dict(size=12, color='white'),
+ showarrow=False
+ )
+
+ return fig
+
+
+def create_empty_chart(message):
+ """Create an empty chart with a message"""
+ fig = go.Figure()
+
+ fig.add_annotation(
+ text=f"📊 {message}",
+ xref="paper", yref="paper",
+ x=0.5, y=0.5,
+ xanchor='center', yanchor='middle',
+ font=dict(
+ size=18,
+ color="white",
+ family="'Geist', sans-serif"
+ ),
+ showarrow=False,
+ bgcolor="rgba(245, 246, 247, 0.05)",
+ bordercolor="rgba(245, 246, 247, 0.2)",
+ borderwidth=1,
+ borderpad=20
+ )
+
+ fig.update_layout(
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=700,
+ width=1450,
+ margin=dict(t=80, b=80, l=80, r=80)
+ )