diff --git "a/tabs/leaderboard_v1.py" "b/tabs/leaderboard_v1.py"
new file mode 100644
--- /dev/null
+++ "b/tabs/leaderboard_v1.py"
@@ -0,0 +1,4093 @@
+"""
+Agent Leaderboard v1 - Main leaderboard interface
+Updated implementation with LLM Type support and optimized radar charts
+"""
+
+import base64
+import math
+import re
+from datetime import datetime
+from pathlib import Path
+
+import gradio as gr
+import pandas as pd
+import plotly.graph_objects as go
+
+# Import components and styles from modular files
+from components.leaderboard_components import (
+ get_chart_colors, get_rank_badge, get_type_badge,
+ get_metric_tooltip, get_responsive_styles, get_faq_section
+)
+from styles.leaderboard_styles import get_leaderboard_css
+
+ASSET_ICON_PATH = Path("krew_icon.png")
+KREW_ICON_BASE64 = ""
+if ASSET_ICON_PATH.exists():
+ KREW_ICON_BASE64 = base64.b64encode(ASSET_ICON_PATH.read_bytes()).decode("utf-8")
+
+CSV_PATH = Path("combined_evaluation_summary.csv")
+if CSV_PATH.exists():
+ EVALUATION_DATE = datetime.fromtimestamp(CSV_PATH.stat().st_mtime).strftime("%Y-%m-%d")
+else:
+ EVALUATION_DATE = datetime.today().strftime("%Y-%m-%d")
+
+
+def create_leaderboard_v2_tab():
+ """Create the main leaderboard v1 tab with interactive table"""
+ token_to_cost_factor = 2e-6 # Rough cost per token ($2 per 1M tokens)
+ tokens_per_turn = 1000 # Approximate tokens exchanged per turn for scaling
+ level_ids = [f"L{i}" for i in range(1, 8)]
+ level_tsq_sources = {
+ "L1": "L1_ArgAcc",
+ "L2": "L2_SelectAcc",
+ "L3": "L3_PSM",
+ "L4": "L4_Coverage",
+ "L5": "L5_AdaptiveRoutingScore",
+ "L6": "L6_EffScore",
+ "L7": "L7_ContextRetention",
+ }
+
+ def load_leaderboard_data():
+ """Load and prepare the leaderboard data"""
+ df = pd.read_csv('combined_evaluation_summary.csv')
+
+ # Clean and prepare data
+ df = df.copy()
+ numeric_candidate_cols = [col for col in df.columns if col not in ('Model', 'Vendor')]
+ for col in numeric_candidate_cols:
+ df[col] = pd.to_numeric(df[col], errors='coerce')
+
+ # Derive per-level helper columns for cost and turns
+ sr_columns = []
+ tsq_columns = []
+ duration_columns = []
+ cost_columns = []
+ turns_columns = []
+
+ for level in level_ids:
+ sr_col = f"{level}_SR"
+ if sr_col in df.columns:
+ sr_columns.append(sr_col)
+ df[sr_col] = df[sr_col].round(3)
+
+ tsq_source = level_tsq_sources.get(level)
+ if tsq_source and tsq_source in df.columns:
+ tsq_columns.append(tsq_source)
+
+ duration_col = f"{level}_Avg_Exec_Time"
+ if duration_col in df.columns:
+ duration_columns.append(duration_col)
+
+ token_col = f"{level}_Avg_Tokens"
+ if token_col in df.columns:
+ cost_col = f"{level}_Avg_Cost"
+ turns_col = f"{level}_Avg_Turns"
+ df[cost_col] = df[token_col] * token_to_cost_factor
+ df[turns_col] = df[token_col] / tokens_per_turn
+ cost_columns.append(cost_col)
+ turns_columns.append(turns_col)
+
+ if sr_columns:
+ df['Avg AC'] = df[sr_columns].mean(axis=1)
+ if tsq_columns:
+ df['Avg TSQ'] = df[tsq_columns].mean(axis=1)
+ if cost_columns:
+ df['Avg Total Cost'] = df[cost_columns].mean(axis=1)
+ if duration_columns:
+ df['Avg Session Duration'] = df[duration_columns].mean(axis=1)
+ if turns_columns:
+ df['Avg Turns'] = df[turns_columns].mean(axis=1)
+
+ # Derive core capability metrics for radar visualization
+ if sr_columns:
+ df['Overall Success'] = df[sr_columns].mean(axis=1)
+ execution_cols = [col for col in ['L1_CallEM', 'L1_ArgAcc', 'L2_SelectAcc'] if col in df.columns]
+ if execution_cols:
+ df['Execution Accuracy'] = df[execution_cols].mean(axis=1)
+ reasoning_cols = [col for col in ['L3_ProvAcc', 'L3_PSM', 'L4_Coverage'] if col in df.columns]
+ if reasoning_cols:
+ df['Complex Reasoning'] = df[reasoning_cols].mean(axis=1)
+ robustness_cols = [col for col in ['L5_AdaptiveRoutingScore', 'L5_FallbackSR'] if col in df.columns]
+ if robustness_cols:
+ df['Robustness'] = df[robustness_cols].mean(axis=1)
+ context_cols = [col for col in ['L6_ReuseRage', 'L6_EffScore', 'L7_ContextRetention'] if col in df.columns]
+ if context_cols:
+ df['Context & Efficiency'] = df[context_cols].mean(axis=1)
+ epr_cols = [f"L{i}_EPR_CVR" for i in range(1, 8) if f"L{i}_EPR_CVR" in df.columns]
+ if epr_cols:
+ df['Call Validity'] = df[epr_cols].mean(axis=1)
+
+ # Use LLM Type from CSV directly, with mapping to display names
+ if 'LLM Type' in df.columns:
+ # Clean the LLM Type column to remove any whitespace
+ df['LLM Type'] = df['LLM Type'].astype(str).str.strip()
+
+ # Map LLM Type to Model Type
+ def map_llm_type(llm_type):
+ if llm_type.upper() == "OSS":
+ return "Open source"
+ else:
+ return "Proprietary"
+
+ df['Model Type'] = df['LLM Type'].apply(map_llm_type)
+ else:
+ # Fallback to vendor mapping if LLM Type column doesn't exist
+ vendor_model_type_map = {
+ "OpenAI": "Proprietary",
+ "Anthropic": "Proprietary",
+ "Google": "Proprietary",
+ "Microsoft": "Proprietary",
+ "Mistral": "Proprietary",
+ "Databricks": "Open source",
+ "Meta": "Open source",
+ "Alibaba": "Open source",
+ "알리바바": "Open source", # Korean name for Alibaba
+ "Kakao": "Open source",
+ "SKT": "Open source",
+ "KT": "Open source",
+ "xAI": "Proprietary",
+ }
+ df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary')
+
+ # Round numeric columns for better display
+ round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy',
+ 'Complex Reasoning', 'Robustness', 'Context & Efficiency', 'Call Validity']
+ round_one_cols = ['Avg Session Duration', 'Avg Turns']
+ for col in round_three_cols:
+ if col in df.columns:
+ df[col] = pd.to_numeric(df[col], errors='coerce').round(3)
+ for col in round_one_cols:
+ if col in df.columns:
+ df[col] = pd.to_numeric(df[col], errors='coerce').round(1)
+ if cost_columns:
+ df[cost_columns] = df[cost_columns].apply(pd.to_numeric, errors='coerce').round(3)
+ if turns_columns:
+ df[turns_columns] = df[turns_columns].apply(pd.to_numeric, errors='coerce').round(2)
+ if duration_columns:
+ df[duration_columns] = df[duration_columns].apply(pd.to_numeric, errors='coerce').round(2)
+
+ # Fill NaN values appropriately
+ df = df.fillna('')
+
+ return df
+
+ def build_static_radar_chart(values, labels):
+ """Render a small static radar chart as inline SVG"""
+ if not values or all(v == 0 for v in values):
+ return """
+
+ 레이더 차트
+ 기초 수행력 · 복합 추론력 · 견고성 · 맥락 효율성 · 전반적 성공률 · 기본적 유효성
+
+ """
+ size = 220
+ center = size / 2
+ radius = size * 0.38
+ n = len(values)
+ def point(v, idx, scale=1.0):
+ angle = (2 * math.pi * idx / n) - math.pi / 2
+ r = radius * v * scale
+ x = center + r * math.cos(angle)
+ y = center + r * math.sin(angle)
+ return x, y
+ polygon_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(v, i) for i, v in enumerate(values)))
+ ring_polygons = []
+ for step in (0.33, 0.66, 1.0):
+ ring_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(step, i) for i in range(n)))
+ opacity = 0.04 if step < 1.0 else 0.08
+ ring_polygons.append(f'')
+ axis_lines = "\n".join(
+ f''
+ for idx in range(n)
+ )
+ label_spans = "\n".join(
+ f'{label}'
+ for idx, label in enumerate(labels)
+ )
+ svg = f"""
+
+ """
+ return svg
+
+ # Level metadata for the 7-stage task framework
+ level_details = {
+ "ALL": {
+ "title": "ALL · 전체 태스크",
+        "description": "7개의 태스크 전반의 평균 성능을 한눈에 살펴보고 각 레벨 비교를 위한 기준점을 제공합니다."
+ },
+ "L1": {
+ "title": "L1 · 단일 도구 실행",
+ "description": "단일 도구 실행 능력과 기본적인 명령 수행 정확도를 평가합니다."
+ },
+ "L2": {
+ "title": "L2 · 도구 선택 능력",
+ "description": "요구 사항에 맞는 도구를 고르고 적절한 파라미터로 호출하는 능력을 측정합니다."
+ },
+ "L3": {
+ "title": "L3 · 순차적 추론 (Chaining)",
+ "description": "복수 단계의 순차적 reasoning을 통해 문제를 해결하는 과정을 검증합니다."
+ },
+ "L4": {
+ "title": "L4 · 병렬적 추론 (Aggregation)",
+ "description": "여러 소스의 정보를 병렬적으로 통합하고 요약하는 능력을 평가합니다."
+ },
+ "L5": {
+ "title": "L5 · 강건성 (Robustness / Fallback)",
+ "description": "예상치 못한 오류나 실패 상황에 대한 인지와 대응 전략을 확인합니다."
+ },
+ "L6": {
+ "title": "L6 · 효율성 (Efficiency)",
+ "description": "최소한의 호출과 비용으로 목표를 달성하는 운영 효율을 살펴봅니다."
+ },
+ "L7": {
+ "title": "L7 · 장기 컨텍스트 기억 (Contextual Memory)",
+ "description": "장기 대화 맥락을 유지하고 적절히 활용하는 능력을 집중적으로 분석합니다."
+ }
+ }
+ default_level = "ALL"
+
+ sr_column_map = {level: f"{level}_SR" for level in level_ids}
+ overall_sort_column = "Overall Success"
+
+ def resolve_level(level_value):
+ """Normalize the incoming level filter value"""
+ if not level_value:
+ return default_level
+ return level_value if level_value in level_details else default_level
+
+ def generate_html_table(filtered_df, highlight_column):
+ """Generate styled HTML table with per-level success rates"""
+ valid_highlights = list(sr_column_map.values()) + ["Overall Success"]
+ highlight_column = highlight_column if highlight_column in valid_highlights else None
+ overall_column = "Overall Success"
+ overall_highlight = (highlight_column == overall_column)
+ highlight_map = {level: (sr_column_map[level] == highlight_column) for level in level_ids}
+
+ table_html = """
+
+
+
+
+
+
+ | Rank |
+ Model |
+ Vendor |
+ LLM Type |
+ """
+ overall_header_classes = ["numeric-cell"]
+ if overall_highlight:
+ overall_header_classes.append("highlight-header")
+ table_html += f"""
+
+ """
+ for level in level_ids:
+ header_classes = ["numeric-cell"]
+ if highlight_map.get(level):
+ header_classes.append("highlight-header")
+ table_html += f"""
+
+ """
+ table_html += """
+
+
+
+ """
+ def safe_float(value):
+ if value is None:
+ return ''
+ if isinstance(value, str) and value.strip() == '':
+ return ''
+ if pd.isna(value):
+ return ''
+ try:
+ return float(value)
+ except (TypeError, ValueError):
+ return ''
+
+ # Generate table rows
+ for idx, (_, row) in enumerate(filtered_df.iterrows()):
+ rank = idx + 1
+ table_html += f"""
+
+ | {get_rank_badge(rank)} |
+ {row['Model']} |
+ {row['Vendor']} |
+ {get_type_badge(row['Model Type'])} |
+ """
+ overall_value = safe_float(row.get(overall_column, ''))
+ if overall_value != '':
+ overall_display = f'{overall_value:.3f}'
+ else:
+ overall_display = '-'
+ overall_classes = ["numeric-cell"]
+ if overall_highlight:
+ overall_classes.append("highlight-cell")
+ table_html += f'{overall_display} | '
+ for level in level_ids:
+ sr_col = sr_column_map[level]
+ value = safe_float(row.get(sr_col, ''))
+ if value != '':
+ value_display = f'{value:.3f}'
+ else:
+ value_display = '-'
+ cell_classes = ["numeric-cell"]
+ if highlight_map.get(level):
+ cell_classes.append("highlight-cell")
+ table_html += f'{value_display} | '
+ table_html += "
"
+
+ table_html += """
+
+
+
+ """
+
+ return table_html
+
+ def update_leaderboard_title(level_filter):
+ """Update the leaderboard title based on selected level"""
+ level_key = resolve_level(level_filter)
+ level_info = level_details.get(level_key, level_details[default_level])
+ level_title = level_info["title"]
+ level_description = level_info["description"]
+
+ return f"""
+
+
+
Agent Leaderboard · {level_title}
+
{level_description}
+
+
+ """
+
+ model_type_lookup = {
+ "OSS": "Open source",
+ "API": "Proprietary"
+ }
+
+ def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"):
+ """Apply shared filters and sorting to the leaderboard dataframe."""
+ filtered_df = df.copy()
+ level_key = resolve_level(level_filter)
+ highlight_column = None
+
+ if model_type_filter != "All":
+ mapped_type = model_type_lookup.get(model_type_filter, model_type_filter)
+ filtered_df = filtered_df[filtered_df['Model Type'] == mapped_type]
+
+ actual_sort_column = sort_by if sort_by in filtered_df.columns else None
+ if not actual_sort_column:
+ if level_key == "ALL":
+ actual_sort_column = overall_sort_column if overall_sort_column in filtered_df.columns else None
+ else:
+ actual_sort_column = sr_column_map.get(level_key)
+
+ if level_key in sr_column_map:
+ highlight_column = sr_column_map[level_key]
+ elif level_key == "ALL" and overall_sort_column in filtered_df.columns:
+ highlight_column = overall_sort_column
+
+ if actual_sort_column and actual_sort_column in filtered_df.columns:
+ ascending = (sort_order == "Ascending")
+ filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last')
+
+ return filtered_df, level_key, highlight_column
+
+ def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order):
+ """Filter and sort the leaderboard data"""
+ df = load_leaderboard_data()
+
+ filtered_df, level_key, highlight_column = apply_filters(df, level_filter, model_type_filter, sort_order, sort_by)
+
+ # Generate HTML table
+ return generate_html_table(filtered_df, highlight_column)
+
+ # Load initial data
+ initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending")
+ initial_df = load_leaderboard_data() # Load raw data for model selector
+ initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else []
+ initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else []
+ initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models)
+ initial_level_metric_level = level_ids[0] if level_ids else None
+ initial_level_model_choices = initial_df['Model'].tolist() if len(initial_df) > 0 else []
+ initial_level_model_values = initial_level_model_choices[:5]
+ initial_level_metric_chart = create_level_metric_chart(
+ initial_df,
+ initial_level_metric_level,
+ initial_level_model_values
+ ) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available")
+
+ # Load custom CSS and responsive styles
+ custom_css = get_leaderboard_css() + get_responsive_styles() + """
+
+
+
+ """
+
+ gr.HTML(custom_css)
+
+ # Header styles and navigation
+ gr.HTML("""
+
+ """)
+
+ gr.Image(
+ value="banner.png",
+ show_label=False,
+ interactive=False,
+ type="filepath",
+ elem_id="hero-banner"
+ )
+
+ gr.HTML("""
+
+
Hugging Face KREW Ko-AgentBench
+
한국 실사용 환경 특화 에이전트 벤치마크
+
+ """)
+
+ # Links section below title
+ gr.HTML("""
+
+ """)
+
+ # Section 1: 단계별 태스크 설계
+ gr.HTML("""
+
+
+
단순 도구 호출부터 장기적 맥락 능력, 강건성 처리 능력까지 에이전트의 능력을 7단계로 입체적으로 분석하였습니다.
+
+
+
Single-Turn
+
+ 80%
+
+
+ - L1: 단일 도구 실행
+ - L2: 도구 선택 능력
+ - L3: 순차적 reasoning (Chaining)
+ - L4: 병렬적 reasoning (Aggregation)
+ - L5: 강건성 (Robustness / Fallback)
+
+
+
+
Multi-Turn
+
+ 20%
+
+
+ - L6: 효율성 (Efficiency)
+ - L7: 장기 컨텍스트 기억 (Contextual Memory)
+
+
+
+
+ """)
+
+ # Section 2: 핵심 시나리오 구성
+ gr.HTML("""
+
+
+
+
네이버, 지도, 카카오, 웹사이트 등 한국 실사용 환경 기반의 API를 기반으로 국내 사용자의 일상과 밀접한 '약속 예약', '블로그 후기 검색'과 같은 현실적인 문제 해결 시나리오를 구현했습니다.
+
+
⌄
+
+ """)
+
+ # Section 3: 핵심 평가 기준
+ gr.HTML("""
+
+
+
+
+
핵심 기반 반복 평가
+
+ - 실패 API 응답 개선
+ - '정보 속성 불일치성 변경' 등 기존 벤치마크의 고질적 문제 해결
+ - 벤치마크의 일관성과 신뢰도 보장
+
+
+
+
강건성 테스트
+
+ - 의도된 오류 상황(상품 단종)의 오류 인식/대응 능력(전략)까지 평가
+ - 현실 환경에서도 안정적으로 작동하는 모델 선별
+
+
+
+
단계별 고유 정밀 지표
+
+ - 도구 선택, 파라미터 구성, 데이터 흐름 등 문제 해결의 불필요/소요 단계별 평가
+ - 모델의 강/약점 정량적으로 식별
+
+
+
+
+ """)
+
+ # Metrics overview cards removed per updated design
+
+ # Domain filter section with enhanced styling
+ gr.HTML("""
+
+
+ """)
+
+ level_options = list(level_details.keys())
+
+ with gr.Column(elem_classes=["domain-selector-container"], elem_id="task-level-selector"):
+ gr.HTML("""
+
+
🧠 Select Task Level
+
Ko-AgentBench의 ALL · L1~L7 단계별 에이전트 성능을 손쉽게 비교하세요.
+
+ """)
+ domain_filter = gr.Radio(
+ choices=level_options,
+ value=default_level,
+ label="",
+ interactive=True,
+ container=False,
+ elem_classes=["domain-radio"]
+ )
+
+ # Filter controls with domain styling
+ with gr.Column(elem_classes=["domain-selector-container", "filters-sorting-container"], elem_id="filters-sorting-container"):
+ gr.HTML("""
+
+
🔍 Filters & Sorting
+
모델 접근 방식과 정렬 순서를 선택해 맞춤 뷰를 구성하세요.
+
+ """)
+ with gr.Row(elem_classes=["filters-sorting-row"]):
+ with gr.Column(scale=1, elem_classes=["filter-group"]):
+ with gr.Row(elem_classes=["filter-group-row"]):
+ gr.HTML("
Model Access")
+ model_type_filter = gr.Radio(
+ choices=["All", "OSS", "API"],
+ value="All",
+ label="",
+ elem_classes=["domain-radio"],
+ container=False
+ )
+ with gr.Column(scale=1, elem_classes=["filter-group"]):
+ with gr.Row(elem_classes=["filter-group-row"]):
+ gr.HTML("
Sort Order")
+ sort_order = gr.Radio(
+ choices=["Descending", "Ascending"],
+ value="Descending",
+ label="",
+ elem_classes=["domain-radio"],
+ container=False
+ )
+
+ # Main leaderboard table with dynamic title
+ leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
+
+ leaderboard_table = gr.HTML(initial_table)
+
+ gr.HTML("""
+
+
""")
+
+ # Radar Chart Section
+ gr.HTML("""
+
+
+
Core Capability Radar
+
Track six essential pillars: Success, Execution, Reasoning, Robustness, Efficiency, and Call Validity.
+
+ """)
+
+ with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="radar-model-selector"):
+ gr.HTML("""
+
+
🎯 Select Models for Comparison
+
Choose up to 5 models to map on the capability radar.
+
+ """)
+ model_selector = gr.Dropdown(
+ choices=initial_df['Model'].tolist()[:10],
+ value=initial_df['Model'].tolist()[:5],
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown"]
+ )
+
+ # Radar chart plot - wrapped in centered container
+ gr.HTML('
')
+ radar_chart = gr.Plot(
+ label="",
+ value=create_domain_radar_chart(
+ load_leaderboard_data(),
+ initial_df['Model'].tolist()[:5]
+ ),
+ elem_classes=["radar-chart", "plot-container"]
+ )
+ gr.HTML('
')
+
+ gr.HTML("
")
+
+ # Level metric breakdown section
+ gr.HTML("""
+
+
+
Level-Specific Metric Spotlight
+
Dive deeper into each Ko-AgentBench stage and compare model scores across its unique evaluation metrics.
+
+ """)
+
+ with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
+ gr.HTML("""
+
+
🧭 Select Task Level and Models
+
Choose a level and up to 5 models to explore their detailed SR-driven metrics.
+
+ """)
+ level_metric_selector = gr.Dropdown(
+ choices=level_ids,
+ value=level_ids[0] if level_ids else None,
+ multiselect=False,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["level-dropdown"]
+ )
+ level_model_selector = gr.Dropdown(
+ choices=initial_level_model_choices,
+ value=initial_level_model_values,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown", "level-model-dropdown"]
+ )
+
+ gr.HTML('
')
+ level_metric_chart = gr.Plot(
+ label="",
+ value=initial_level_metric_chart,
+ elem_classes=["level-metric-plot", "plot-container"]
+ )
+ gr.HTML("""
+
+
+ """)
+
+ # Heatmap section
+ gr.HTML("""
+
+
+
Comprehensive Performance Heatmap
+
View Ko-AgentBench SR scores across L1~L7 for each model in a single glance.
+
+
+ """)
+ heatmap_chart = gr.Plot(
+ label="",
+ value=initial_heatmap,
+ elem_classes=["heatmap-plot", "plot-container"]
+ )
+ gr.HTML("""
+
+
+ """)
+
+ # Update functions
+ def get_optimal_sort_order(sort_by_value):
+ """Return the optimal sort order for a given metric"""
+ # Metrics where higher is better (descending)
+ descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
+
+ # Metrics where lower is better (ascending)
+ ascending_metrics = []
+
+ if sort_by_value in descending_metrics:
+ return "Descending"
+ elif sort_by_value in ascending_metrics:
+ return "Ascending"
+ else:
+ return "Descending" # Default fallback
+
+ def update_table(level_filter, model_type_filter, sort_order):
+ title_html = update_leaderboard_title(level_filter)
+ sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
+ table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
+ return title_html, table_html
+
+ def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+ # Get filtered dataframe
+ df = load_leaderboard_data()
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+
+ # Update model selector choices based on filtered data
+ available_models_all = filtered_df['Model'].tolist()
+ available_models = available_models_all[:15] # Top 15 from filtered results
+
+ # If selected models are not in available models, reset to top 5
+ if selected_models:
+ valid_selected = [m for m in selected_models if m in available_models]
+ if not valid_selected:
+ valid_selected = available_models[:5]
+ else:
+ valid_selected = available_models[:5]
+
+ # Create radar chart
+ chart = create_domain_radar_chart(filtered_df, valid_selected)
+
+ # Prepare heatmap order prioritizing selected models
+ heatmap_order = []
+ for model in valid_selected:
+ if model not in heatmap_order:
+ heatmap_order.append(model)
+ for model in available_models_all:
+ if model not in heatmap_order:
+ heatmap_order.append(model)
+ heatmap_order = heatmap_order[:12]
+ heatmap_fig = create_performance_heatmap(filtered_df, heatmap_order)
+
+ # Level metric chart
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+ available_level_models = available_models_all
+ if level_selected_models:
+ valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
+ if not valid_level_models:
+ valid_level_models = available_level_models[:5]
+ else:
+ valid_level_models = available_level_models[:5]
+ level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+
+ return (
+ gr.Dropdown(
+ choices=available_models,
+ value=valid_selected,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown"]
+ ),
+ chart,
+ heatmap_fig,
+ gr.Dropdown(
+ choices=available_level_models,
+ value=valid_level_models,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown", "level-model-dropdown"]
+ ),
+ level_metric_fig,
+ )
+
+ def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+ # Get filtered dataframe
+ df = load_leaderboard_data()
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+
+ available_models_all = filtered_df['Model'].tolist()
+ if selected_models:
+ valid_selected = [m for m in selected_models if m in available_models_all]
+ if not valid_selected:
+ valid_selected = available_models_all[:5]
+ else:
+ valid_selected = available_models_all[:5]
+
+ heatmap_order = []
+ for model in valid_selected:
+ if model not in heatmap_order:
+ heatmap_order.append(model)
+ for model in available_models_all:
+ if model not in heatmap_order:
+ heatmap_order.append(model)
+ heatmap_order = heatmap_order[:12]
+
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+ available_level_models = available_models_all
+ if level_selected_models:
+ valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
+ if not valid_level_models:
+ valid_level_models = available_level_models[:5]
+ else:
+ valid_level_models = available_level_models[:5]
+ level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+
+ return (
+ create_domain_radar_chart(filtered_df, valid_selected),
+ create_performance_heatmap(filtered_df, heatmap_order),
+ gr.Dropdown(
+ choices=available_level_models,
+ value=valid_level_models,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown", "level-model-dropdown"]
+ ),
+ level_metric_fig,
+ )
+
+ def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+ df = load_leaderboard_data()
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+ available_models = filtered_df['Model'].tolist()
+ if level_selected_models:
+ valid_level_models = [m for m in level_selected_models if m in available_models][:5]
+ if not valid_level_models:
+ valid_level_models = available_models[:5]
+ else:
+ valid_level_models = available_models[:5]
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+ level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+ return (
+ gr.Dropdown(
+ choices=available_models,
+ value=valid_level_models,
+ multiselect=True,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown", "level-model-dropdown"]
+ ),
+ level_chart,
+ )
+
+ # Update table when filters change
+ filter_inputs = [domain_filter, model_type_filter, sort_order]
+
+ for input_component in filter_inputs:
+ input_component.change(
+ fn=update_table,
+ inputs=filter_inputs,
+ outputs=[leaderboard_title, leaderboard_table]
+ )
+
+ # Also update radar chart when filters change
+ input_component.change(
+ fn=update_radar_chart,
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+ outputs=[model_selector, radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
+ )
+
+ # Update radar chart when model selection changes
+ model_selector.change(
+ fn=update_radar_only,
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+ outputs=[radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
+ )
+
+ level_metric_selector.change(
+ fn=update_level_metric_only,
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+ outputs=[level_model_selector, level_metric_chart]
+ )
+
+ level_model_selector.change(
+ fn=update_level_metric_only,
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+ outputs=[level_model_selector, level_metric_chart]
+ )
+
+ # Define generate_performance_card function before using it
+ def generate_performance_card(model_name):
+ """Generate HTML for the model performance card"""
+ if not model_name:
+ return """
+ Please select a model to generate its performance card
+
"""
+
+ # Get model data
+ df = load_leaderboard_data()
+ model_data = df[df['Model'] == model_name]
+
+ if model_data.empty:
+ return """
+ Model not found in the database
+
"""
+
+ row = model_data.iloc[0]
+
+ # Get overall rank based on overall success
+ df_with_success = df.copy()
+ df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce')
+ df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
+ df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
+ try:
+ rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
+ except:
+ rank = 'N/A'
+
+ # Format values
+ def format_value(val, decimals=3, prefix='', suffix=''):
+ if pd.isna(val) or val == '':
+ return 'N/A'
+ return f"{prefix}{float(val):.{decimals}f}{suffix}"
+
+ def format_score(value):
+ if pd.isna(value) or value == '':
+ return 'N/A'
+ return f"{float(value):.3f}"
+
+ radar_metrics = [
+ ("기초 수행력", row.get('Execution Accuracy')),
+ ("복합 추론력", row.get('Complex Reasoning')),
+ ("견고성", row.get('Robustness')),
+ ("맥락 효율성", row.get('Context & Efficiency')),
+ ("전반적 성공률", row.get('Overall Success')),
+ ("기본적 유효성", row.get('Call Validity')),
+ ]
+ radar_values = []
+ radar_labels = []
+ for label, value in radar_metrics:
+ if pd.isna(value) or value == '':
+ radar_values.append(0.0)
+ else:
+ try:
+ radar_values.append(max(0.0, min(1.0, float(value))))
+ except (TypeError, ValueError):
+ radar_values.append(0.0)
+ radar_labels.append(label)
+
+ mini_radar_html = build_static_radar_chart(radar_values, radar_labels)
+
+ level_blocks = []
+ for level in level_ids:
+ sr_col = sr_column_map.get(level)
+ level_blocks.append((level, row.get(sr_col, '')))
+
+ evaluation_date = EVALUATION_DATE
+ icon_html = ""
+ if KREW_ICON_BASE64:
+ icon_html = f'
'
+ else:
+ icon_html = '🤖
'
+
+ card_html = f"""
+
+ """
+
+ return card_html
+
+ # MODEL PERFORMANCE CARD SECTION
+ gr.HTML("""
+
+
+
Model Performance Card
+
Comprehensive performance card for any model - perfect for presentations and reports
+
+
+ """)
+
+ with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
+ gr.HTML("""
+
+
🤖 Select Model
+
비교할 모델을 선택하세요.
+
+ """)
+ card_model_selector = gr.Dropdown(
+ choices=initial_df['Model'].tolist(),
+ value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None,
+ label="",
+ info=None,
+ container=False,
+ elem_classes=["model-dropdown"]
+ )
+ download_card_btn = gr.Button(
+ "Download Card as PNG",
+ elem_id="download-card-btn",
+ elem_classes=["pill-button"]
+ )
+
+ gr.HTML("""
+
+ """)
+
+ # Card display area - generate initial card
+ initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None
+ initial_card_html = generate_performance_card(initial_model) if initial_model else ""
+ card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html")
+
+ gr.HTML("""
+
+
+
+ """)
+
+ # Add custom CSS for the performance card
+ gr.HTML("""
+
+
+ """)
+
+
+ # Wire up the card generator to selection change
+ card_model_selector.change(
+ fn=generate_performance_card,
+ inputs=[card_model_selector],
+ outputs=[card_display]
+ )
+
+ # Wire up download button with html2canvas capture
+ download_card_btn.click(
+ fn=None,
+ js="""
+ async () => {
+ const ensureHtml2Canvas = () => new Promise((resolve, reject) => {
+ if (window.html2canvas) {
+ resolve(window.html2canvas);
+ return;
+ }
+ const existing = document.querySelector('script[data-html2canvas]');
+ if (existing) {
+ existing.addEventListener('load', () => resolve(window.html2canvas));
+ existing.addEventListener('error', reject);
+ return;
+ }
+ const script = document.createElement('script');
+ script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js';
+ script.async = true;
+ script.dataset.html2canvas = 'true';
+ script.onload = () => resolve(window.html2canvas);
+ script.onerror = () => reject(new Error('Failed to load html2canvas'));
+ document.head.appendChild(script);
+ });
+
+ const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
+ await pause(60);
+
+ const card = document.querySelector('.performance-card');
+ if (!card) {
+ alert('Performance card not found. Please select a model first.');
+ return;
+ }
+
+ const btn = document.getElementById('download-card-btn');
+ const originalText = btn?.textContent || '';
+ if (btn) {
+ btn.textContent = 'Generating...';
+ btn.disabled = true;
+ }
+
+ try {
+ const html2canvasLib = await ensureHtml2Canvas();
+ if (!html2canvasLib) {
+ throw new Error('html2canvas unavailable');
+ }
+
+ const canvas = await html2canvasLib(card, {
+ backgroundColor: '#01091A',
+ scale: 2,
+ logging: false,
+ useCORS: true
+ });
+
+ const link = document.createElement('a');
+ const modelName = card.querySelector('.card-model-name')?.textContent || 'model';
+ const timestamp = new Date().toISOString().slice(0, 10);
+ const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`;
+ link.download = fileName;
+ link.href = canvas.toDataURL('image/png');
+ document.body.appendChild(link);
+ link.click();
+ document.body.removeChild(link);
+ } catch (error) {
+ console.error('Error capturing card:', error);
+ alert('Failed to capture performance card. Please try again.');
+ } finally {
+ if (btn) {
+ btn.textContent = originalText;
+ btn.disabled = false;
+ }
+ }
+ }
+ """
+ )
+
    # Keep the performance-card model selector in sync with the table filters:
    # whenever any filter component changes, recompute the filtered model list
    # and reset the dropdown to the first remaining model.
    for input_component in filter_inputs:
        def update_dropdown_and_card(*args):
            # *args arrives in the same order as `filter_inputs`; args[0] is
            # assumed to be the level filter ("ALL" or an L1-L7 label) —
            # TODO(review): confirm against the filter_inputs declaration.
            filtered_df, _, _ = apply_filters(
                load_leaderboard_data(),
                args[0],
                args[1],
                args[2],
                "Overall Success" if args[0] == "ALL" else sr_column_map.get(resolve_level(args[0]), "Overall Success")
            )
            choices = filtered_df['Model'].tolist()
            # Select first model from filtered list
            value = choices[0] if choices else None
            # Returning a new gr.Dropdown replaces the selector's choices/value.
            return gr.Dropdown(
                choices=choices,
                value=value,
                label="",
                info=None,
                container=False,
                elem_classes=["model-dropdown"]
            )

        # NOTE(review): the handler never references `input_component`, so
        # redefining it each iteration is redundant but harmless — every
        # filter component gets an equivalent callback wired to the same
        # inputs/outputs.
        input_component.change(
            fn=update_dropdown_and_card,
            inputs=filter_inputs,
            outputs=[card_model_selector]
        )
+
+ return leaderboard_table
+
+
def create_leaderboard_v2_interface():
    """Build the complete leaderboard v1 interface.

    Thin entry point that delegates all construction to
    ``create_leaderboard_v2_tab`` and returns its result unchanged.
    """
    interface = create_leaderboard_v2_tab()
    return interface
+
+
def create_domain_radar_chart(df, selected_models=None, max_models=5):
    """Visualize six core capability metrics on a radar chart.

    Args:
        df: Leaderboard DataFrame with a 'Model' column plus the aggregate
            capability columns listed in ``metrics_info`` below.
        selected_models: Optional list of model names to plot. Defaults to the
            top ``max_models`` models ranked by "Overall Success".
        max_models: Upper bound on the number of traces drawn.

    Returns:
        A plotly Figure; an empty-state figure when none of the required
        columns are present or the frame is empty.
    """
    df = df.copy()
    # Each entry drives one radar axis: column to read, axis label, and the
    # description shown in the hover tooltip.
    metrics_info = [
        {"column": "Overall Success", "label": "Overall Success", "description": "Average SR across L1-L7"},
        {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
        {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
        {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
        {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
        {"column": "Call Validity", "label": "Call Validity", "description": "Average EPR_CVR across levels"},
    ]

    required_columns = [m["column"] for m in metrics_info]
    if df.empty or not any(col in df.columns for col in required_columns):
        return create_empty_radar_chart("Not enough data to build the capability radar")

    # Default model selection: best performers by overall success rate.
    if not selected_models:
        if "Overall Success" in df.columns:
            top_models = df.sort_values("Overall Success", ascending=False)
        else:
            top_models = df
        selected_models = top_models['Model'].head(max_models).tolist()

    selected_models = selected_models[:max_models]

    # Ensure metric columns are numeric (CSV values may arrive as strings).
    for metric in metrics_info:
        col = metric["column"]
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    fig = go.Figure()
    angle_labels = [m["label"] for m in metrics_info]

    palette = [
        {'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'},
        {'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'},
        {'fill': 'rgba(249, 112, 185, 0.22)', 'line': '#F970B9'},
        {'fill': 'rgba(139, 92, 246, 0.20)', 'line': '#8B5CF6'},
        {'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'},
    ]

    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model'] == model_name]
        if model_data.empty:
            continue

        row = model_data.iloc[0]
        values = []
        tooltips = []
        for metric in metrics_info:
            col = metric["column"]
            value = row[col] if col in row else float('nan')
            # Missing metrics are plotted as 0 so the polygon stays closed.
            if pd.isna(value) or value == '':
                value = 0
            values.append(float(value))
            tooltips.append(metric["description"])

        if not values:
            continue

        # Repeat the first point so the radar polygon closes on itself.
        values_loop = values + [values[0]]
        angles_loop = angle_labels + [angle_labels[0]]
        tooltips_loop = tooltips + [tooltips[0]]
        colors = palette[idx % len(palette)]

        fig.add_trace(
            go.Scatterpolar(
                r=values_loop,
                theta=angles_loop,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(color=colors['line'], width=3),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A')
                ),
                name=model_name,
                customdata=tooltips_loop,
                mode="lines+markers",
                # <extra></extra> suppresses the secondary hover box.
                hovertemplate="%{fullData.name}<br>" +
                              "%{theta}<br>" +
                              "%{customdata}<br>" +
                              "%{r:.3f}<br>" +
                              "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7", size=12, family="'Geist', sans-serif")
                )
            )
        )

    # Radial ticks at 0.00, 0.20, ..., 1.00.
    tick_vals = [i / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]

    fig.update_layout(
        polar=dict(
            bgcolor='rgba(245, 246, 247, 0.03)',
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                )
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=13,
                    family="'Geist', sans-serif",
                    color='#F5F6F7',
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(size=12, family="'Geist', sans-serif", color='#F5F6F7'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="Core Capability Radar",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=900,
        margin=dict(t=30, b=50, l=10, r=10),
        autosize=True,
        annotations=[
            dict(
                text="Galileo Agent Leaderboard",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color='#64748B'),
                showarrow=False
            )
        ]
    )

    return fig
+
+
def create_performance_heatmap(df, ordered_models=None, max_models=12):
    """Render a heatmap of SR scores across task levels for selected models.

    Args:
        df: Leaderboard DataFrame with a 'Model' column and per-level
            ``L{n}_SR`` columns.
        ordered_models: Optional explicit model ordering; models absent from
            the frame are dropped. Defaults to ranking by overall success.
        max_models: Maximum number of model columns rendered.

    Returns:
        A plotly Figure; an empty-state figure when SR data is missing.
    """
    df = df.copy()
    level_sequence = [f"L{i}" for i in range(1, 8)]
    sr_columns = []
    for level in level_sequence:
        col = f"{level}_SR"
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            sr_columns.append((level, col))
    if df.empty or not sr_columns:
        return create_empty_heatmap("Not enough SR data to render the heatmap")

    df = df.drop_duplicates(subset=["Model"])
    if df.empty:
        return create_empty_heatmap("No models available to render the heatmap")

    # Rank by overall success when available; otherwise fall back to the first
    # level's SR column. sr_columns holds (level, column) tuples, so index the
    # column name explicitly (passing the tuple to sort_values would raise).
    sort_column = "Overall Success" if "Overall Success" in df.columns else sr_columns[0][1]
    df = df.sort_values(sort_column, ascending=False)

    if ordered_models:
        ordered_models = [m for m in ordered_models if m in df["Model"].tolist()]
    else:
        ordered_models = df["Model"].tolist()

    if not ordered_models:
        return create_empty_heatmap("No models available to render the heatmap")

    ordered_models = ordered_models[:max_models]
    heatmap_df = df.set_index("Model").reindex(ordered_models)

    level_labels = []
    z_matrix = []
    has_values = False

    # Build one heatmap row per level; None cells render as gaps.
    for level, col in sr_columns:
        if col not in heatmap_df.columns:
            continue
        label = f"{level} · SR"
        level_labels.append(label)
        row_values = []
        for model in ordered_models:
            value = heatmap_df.at[model, col] if model in heatmap_df.index else None
            if pd.isna(value):
                row_values.append(None)
            else:
                val = float(value)
                row_values.append(val)
                has_values = True
        z_matrix.append(row_values)

    if not level_labels or not has_values:
        return create_empty_heatmap("Not enough SR data to render the heatmap")

    # Dark-to-gold scale: low SR fades into the page background.
    colorscale = [
        [0.0, "#0A0A0A"],
        [0.25, "#1A1411"],
        [0.5, "#332818"],
        [0.75, "#B8660A"],
        [1.0, "#FFD21E"],
    ]

    fig = go.Figure()
    fig.add_trace(
        go.Heatmap(
            z=z_matrix,
            x=ordered_models,
            y=level_labels,
            colorscale=colorscale,
            zmin=0,
            zmax=1,
            hovertemplate="%{y}<br>%{x}<br>SR · %{z:.3f}",
            colorbar=dict(
                # title.font replaces the deprecated `titlefont` attribute.
                title=dict(
                    text="Success Rate",
                    font=dict(color="#F5F6F7", family="'Geist', sans-serif", size=12)
                ),
                tickfont=dict(color="#94A3B8", family="'Geist', sans-serif", size=10),
                thickness=12,
                len=0.7,
                outlinecolor="rgba(255, 255, 255, 0.1)",
                bgcolor="rgba(1, 9, 26, 0.75)"
            ),
            showscale=True
        )
    )

    # Overlay the numeric score on each populated cell; dark text on bright
    # cells, light text on dark cells.
    annotations = []
    for y_idx, level in enumerate(level_labels):
        for x_idx, model in enumerate(ordered_models):
            value = z_matrix[y_idx][x_idx]
            if value is None:
                continue
            font_color = "#0B1120" if value >= 0.6 else "#F8FAFC"
            annotations.append(
                dict(
                    x=model,
                    y=level,
                    text=f"{value:.3f}",
                    showarrow=False,
                    font=dict(
                        family="'Geist Mono', monospace",
                        size=11,
                        color=font_color
                    )
                )
            )

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        margin=dict(t=80, b=90, l=110, r=160),
        height=520,
        width=1450,
        font=dict(family="'Geist', sans-serif", color="#F5F6F7"),
        xaxis=dict(
            tickangle=-25,
            showgrid=False,
            ticks="",
            tickfont=dict(size=11, family="'Geist', sans-serif", color="#F5F6F7")
        ),
        yaxis=dict(
            showgrid=False,
            ticks="",
            tickfont=dict(size=12, family="'Geist', sans-serif", color="#F5F6F7")
        ),
        annotations=annotations,
        title=dict(
            text="Comprehensive Performance Heatmap",
            x=0.5,
            y=0.98,
            font=dict(
                size=20,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        )
    )
    fig.update_xaxes(side="bottom")

    return fig
+
+
def create_empty_heatmap(message):
    """Build a placeholder heatmap figure showing only a centered message."""
    figure = go.Figure()

    figure.add_annotation(
        text=f"🗺️ {message}",
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        xanchor='center',
        yanchor='middle',
        showarrow=False,
        font=dict(size=18, color="#94A3B8", family="'Geist', sans-serif"),
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20,
    )

    # Same chrome (title, colors, dimensions) as the populated heatmap so the
    # empty state swaps in without any layout shift.
    heading = dict(
        text="Comprehensive Performance Heatmap",
        x=0.5,
        y=0.98,
        font=dict(size=20, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
    )
    figure.update_layout(
        title=heading,
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=520,
        width=1450,
        margin=dict(t=80, b=80, l=80, r=160),
    )

    # No data to plot, so hide both axes entirely.
    figure.update_xaxes(visible=False)
    figure.update_yaxes(visible=False)
    return figure
+
+
def create_level_metric_chart(df, level, selected_models=None, max_models=5):
    """Render a grouped horizontal bar chart of per-model scores for a level's metrics.

    Args:
        df: Leaderboard DataFrame; metric columns are named ``{level}_{Metric}``.
        level: Level identifier such as "L3"; falsy values yield an empty chart.
        selected_models: Optional explicit model list; defaults to the top
            models by "Overall Success" (or the first metric column).
        max_models: Maximum number of models plotted.

    Returns:
        A plotly Figure; an empty-state figure when no plottable metrics exist.
    """
    if not level:
        return create_empty_level_metric_chart("Select a level to view its metrics")

    df = df.copy()
    level_prefix = f"{level}_"
    level_columns = [col for col in df.columns if col.startswith(level_prefix)]

    # Keep only metrics that look like 0-1 scores: drop cost columns, columns
    # with no numeric values, and columns whose values fall outside [0, 1.05].
    metric_columns = []
    for col in level_columns:
        metric_suffix = col[len(level_prefix):]
        metric_key_lower = metric_suffix.lower()
        if "cost" in metric_key_lower:
            continue
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        valid_values = numeric_series.dropna()
        if valid_values.empty:
            continue
        if (valid_values < 0).any() or (valid_values > 1.05).any():
            continue
        df[col] = numeric_series
        metric_columns.append(col)

    if not metric_columns:
        return create_empty_level_metric_chart("This level has no 0-1 metrics to visualize")

    df = df.drop_duplicates(subset=['Model'])
    if df.empty:
        return create_empty_level_metric_chart("No models available to render level metrics")

    # Resolve which models to draw, preserving caller-supplied ordering.
    if selected_models:
        model_order = [m for m in selected_models if m in df['Model'].tolist()]
    else:
        sort_col = 'Overall Success' if 'Overall Success' in df.columns else metric_columns[0]
        model_order = df.sort_values(sort_col, ascending=False)['Model'].tolist()
    if not model_order:
        model_order = df['Model'].tolist()
    model_order = model_order[:max_models]

    df_models = df[df['Model'].isin(model_order)].set_index('Model')
    if df_models.empty:
        return create_empty_level_metric_chart("No matching models for selected filters")

    def prettify_metric_name(metric_key):
        """Turn a raw column name like 'L3_ArgAcc' into a display label."""
        raw = metric_key[len(level_prefix):]
        text = raw.replace('_', ' ')
        # Split CamelCase into separate words.
        text = re.sub(r'(?<=.)([A-Z])', r' \1', text)
        text = text.replace('Avg', 'Average')
        # NOTE(review): lookup happens per single word after .split(), so the
        # multi-word keys below ('Call Em', ...) can never match — kept for
        # documentation of intent.
        replacements = {
            'Sr': 'SR',
            'Ac': 'AC',
            'Tsq': 'TSQ',
            'Cvr': 'CVR',
            'Psm': 'PSM',
            'Prov': 'Prov',
            'Call Em': 'CallEM',
            'Reuse Rate': 'Reuse Rate',
            'Eff Score': 'Eff Score'
        }
        words = text.title().split()
        words = [replacements.get(word, word) for word in words]
        return ' '.join(words)

    # De-duplicate display labels with a numeric suffix so every metric keeps
    # its own axis category.
    metric_labels = []
    for col in metric_columns:
        label = prettify_metric_name(col)
        if label in metric_labels:
            suffix = 2
            while f"{label} ({suffix})" in metric_labels:
                suffix += 1
            label = f"{label} ({suffix})"
        metric_labels.append(label)

    model_palette = [
        '#ffd21e',
        '#FF8A3C',
        '#F970B9',
        '#8B5CF6',
        '#F8FAFC',
        '#38BDF8',
    ]

    fig = go.Figure()
    max_value = 0
    for idx, model in enumerate(model_order):
        values = []
        for col in metric_columns:
            value = df_models.at[model, col] if (model in df_models.index and col in df_models.columns) else float('nan')
            if pd.notna(value):
                values.append(float(value))
                max_value = max(max_value, float(value))
            else:
                values.append(None)
        color = model_palette[idx % len(model_palette)]
        fig.add_trace(
            go.Bar(
                name=model,
                y=metric_labels,
                x=values,
                orientation='h',
                marker=dict(color=color, line=dict(color='rgba(1,9,26,0.8)', width=1)),
                hovertemplate="%{y}<br>Model · %{fullData.name}<br>Score · %{x:.3f}",
            )
        )

    # Grow the chart with the number of metric rows; pad the x-range slightly
    # so the longest bar never touches the frame.
    plot_height = max(360, 140 + 48 * len(metric_labels))
    if max_value <= 0:
        x_range = [0, 1]
    else:
        x_range = [0, max_value * 1.05]

    fig.update_layout(
        barmode='group',
        bargap=0.25,
        bargroupgap=0.18,
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=plot_height,
        width=1450,
        margin=dict(t=90, b=80, l=220, r=160),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            bgcolor='rgba(1, 9, 26, 0.75)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            font=dict(size=11, family="'Geist', sans-serif", color='#F5F6F7')
        ),
        xaxis=dict(
            title=dict(text=f"{level} Metric Score", font=dict(size=14, color="#F5F6F7")),
            tickfont=dict(size=11, color="#94A3B8"),
            gridcolor='rgba(245, 246, 247, 0.08)',
            zerolinecolor='rgba(245, 246, 247, 0.18)',
            range=x_range
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="#F5F6F7"),
            automargin=True
        ),
        title=dict(
            text=f"{level} Metric Breakdown",
            x=0.5,
            y=0.98,
            font=dict(size=20, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        )
    )
    return fig
+
+
def create_empty_level_metric_chart(message):
    """Build a placeholder level-metric figure showing only a centered message."""
    figure = go.Figure()

    figure.add_annotation(
        text=f"🧭 {message}",
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        xanchor='center',
        yanchor='middle',
        showarrow=False,
        font=dict(size=18, color="#94A3B8", family="'Geist', sans-serif"),
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20,
    )

    # Mirror the populated chart's chrome so the empty state swaps in cleanly.
    figure.update_layout(
        title=dict(
            text="Level Metric Breakdown",
            x=0.5,
            y=0.98,
            font=dict(size=20, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=420,
        width=1450,
        margin=dict(t=80, b=60, l=80, r=120),
    )

    # Nothing to plot, so hide both axes.
    figure.update_xaxes(visible=False)
    figure.update_yaxes(visible=False)
    return figure
+
+
def create_empty_radar_chart(message):
    """Create an empty radar chart with a centered message.

    Bug fix: the original passed ``annotations=[...]`` to ``update_layout``,
    which *replaces* ``layout.annotations`` and silently discarded the message
    added via ``add_annotation``; the replacement annotation had also lost its
    text. Both annotations are now added with ``add_annotation`` so they
    coexist, and the watermark text (matching the populated radar chart) is
    restored.
    """
    fig = go.Figure()

    # Centered empty-state message.
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        # NOTE(review): 1450px tall is much larger than the other empty states
        # (420-700px) and the populated radar (800px); preserved as-is to avoid
        # changing existing layout behavior — confirm whether intentional.
        height=1450,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            text="Core Capability Radar",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
    )

    # Bottom-right watermark, mirroring the populated radar chart.
    fig.add_annotation(
        text="Galileo Agent Leaderboard",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False
    )

    return fig
+
+
+# NEW VISUALIZATION FUNCTIONS
+
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot of session cost vs. performance.

    Args:
        df: Leaderboard DataFrame with 'Avg Total Cost', 'Avg Turns',
            'Model Type', and the chosen performance metric column.
        metric: Performance column to plot on the x-axis ("Avg AC" or
            "Avg TSQ").

    Returns:
        A plotly Figure; an empty-state figure when no usable rows remain.
    """
    # Drop models missing either cost or performance data.
    df_filtered = df[(df['Avg Total Cost'] != '') & (df[metric] != '')].copy()
    label_map = {
        'Proprietary': 'API',
        'Open source': 'OSS'
    }

    if df_filtered.empty:
        return create_empty_chart("No data available for cost-performance analysis")

    # Convert to numeric (CSV fields may arrive as strings).
    df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce')
    df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce')
    df_filtered['Avg Turns'] = pd.to_numeric(df_filtered['Avg Turns'], errors='coerce')

    # One color per model type; unknown types fall back to white in the loop.
    color_map = {
        'Proprietary': '#1098F7',  # Airglow Blue for Proprietary
        'Open source': '#58BC82'   # Green for Open source
    }

    fig = go.Figure()

    # One trace per model type so the legend groups by licensing.
    for model_type in df_filtered['Model Type'].unique():
        df_type = df_filtered[df_filtered['Model Type'] == model_type]
        legend_name = label_map.get(model_type, model_type)

        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type['Avg Total Cost'],
            mode='markers+text',
            name=legend_name,
            text=df_type['Model'],
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            # Route the true turn count through customdata: marker.size is
            # turns * 3, so hovering on it would have reported 3x the value.
            customdata=df_type['Avg Turns'],
            marker=dict(
                size=df_type['Avg Turns'] * 3,  # Size encodes number of turns
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A')
            ),
            hovertemplate="%{text}<br>" +
                          f"{metric}: %{{x:.3f}}<br>" +
                          "Cost: $%{y:.3f}<br>" +
                          "Turns: %{customdata:.1f}<br>" +
                          "<extra></extra>"
        ))

    # Median cross-hairs split the plot into quadrants.
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Total Cost'].median()

    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Quadrant labels: bottom-right = cheap and good, top-left = costly and bad.
    fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7"), bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)")

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"Cost-Performance Efficiency: {metric_display}",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"{metric_display}",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="Average Session Cost ($)",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='#F5F6F7'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )

    return fig
+
+
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create a scatter plot of session duration vs. performance.

    Args:
        df: Leaderboard DataFrame with 'Avg Session Duration',
            'Avg Total Cost', and the chosen performance metric column.
        metric: Performance column for the x-axis ("Avg AC" or "Avg TSQ").

    Returns:
        A plotly Figure; an empty-state figure when no usable rows remain.
    """
    # Drop models missing either duration or performance data.
    df_filtered = df[(df['Avg Session Duration'] != '') & (df[metric] != '')].copy()

    if df_filtered.empty:
        return create_empty_chart("No data available for speed-accuracy analysis")

    # Convert to numeric (CSV fields may arrive as strings).
    df_filtered['Avg Session Duration'] = pd.to_numeric(df_filtered['Avg Session Duration'], errors='coerce')
    df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce')

    # Marker color encodes session cost.
    df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce')

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=df_filtered[metric],
        y=df_filtered['Avg Session Duration'],
        mode='markers+text',
        text=df_filtered['Model'],
        textposition="top center",
        textfont=dict(size=9, color='#94A3B8'),
        marker=dict(
            size=12,
            color=df_filtered['Avg Total Cost'],
            colorscale=[[0, '#0A0A0A'], [0.5, '#B8660A'], [1, '#ffd21e']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Cost ($)",
                    font=dict(color="#F5F6F7")
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
                x=1.02
            ),
            line=dict(width=2, color='#01091A')
        ),
        hovertemplate="%{text}<br>" +
                      f"{metric}: %{{x:.3f}}<br>" +
                      "Duration: %{y:.1f}s<br>" +
                      "Cost: $%{marker.color:.3f}<br>" +
                      "<extra></extra>"
    ))

    # Median cross-hairs split the plot into quadrants.
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Session Duration'].median()

    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Quadrant labels: bottom-right = fast & accurate (low duration, high score).
    fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7", weight=600))
    fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#ffd21e", weight=600))

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"Speed vs Accuracy Trade-off: {metric_display}",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"{metric_display}",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="Average Session Duration (seconds)",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=120)
    )

    return fig
+
+
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart showing per-domain specialization by model.

    Bubble size encodes absolute domain performance; bubble color encodes the
    deviation of that domain score from the model's own average.

    Args:
        df: Leaderboard DataFrame with 'Avg {metric_type}' and per-domain
            '{Domain} {metric_type}' columns.
        metric_type: "AC" (Action Completion) or "TSQ" (Tool Selection Quality).

    Returns:
        A plotly Figure; an empty-state figure when no domain data exists.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    # Flatten into one record per (model, domain) pair.
    data = []
    for _, model in df.iterrows():
        if model['Model'] == '':
            continue

        model_avg = pd.to_numeric(model[f'Avg {metric_type}'], errors='coerce')
        if pd.isna(model_avg):
            continue

        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col in model and model[domain_col] != '':
                domain_val = pd.to_numeric(model[domain_col], errors='coerce')
                if not pd.isna(domain_val):
                    # Specialization strength = deviation from the model's own average.
                    specialization = domain_val - model_avg
                    data.append({
                        'Model': model['Model'],
                        'Domain': domain,
                        'Performance': domain_val,
                        'Specialization': specialization,
                        # NOTE(review): recorded but not currently used in the
                        # plot below.
                        'Model Type': model['Model Type']
                    })

    if not data:
        return create_empty_chart("No domain specialization data available")

    df_plot = pd.DataFrame(data)

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=df_plot['Performance'] * 30,  # Size encodes absolute performance
            color=df_plot['Specialization'],
            colorscale=[[0, '#B8660A'], [0.5, '#E6B800'], [1, '#ffd21e']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Specialization<br>Strength",
                    font=dict(color="#F5F6F7")
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1
            ),
            line=dict(width=2, color='#01091A'),
            opacity=0.8
        ),
        text=[f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
              for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
        hovertemplate="%{y}<br>" +
                      "Domain: %{x}<br>" +
                      "%{text}<br>" +
                      "<extra></extra>"
    ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"Domain Specialization Matrix: {metric_display}",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text="Business Domains",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        yaxis=dict(
            title=dict(
                text="Models",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=11, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1100,
        width=1450,
        margin=dict(t=100, b=80, l=220, r=120)
    )

    return fig
+
+
def create_performance_gap_analysis(df, metric_type="AC"):
    """Create a range plot showing performance spread per business domain.

    For each domain, draws the min-max range, the interquartile band, and a
    diamond at the median; domains are ordered by the size of their gap.

    Args:
        df: Leaderboard DataFrame with '{Domain} {metric_type}' columns.
        metric_type: "AC" or "TSQ".

    Returns:
        A plotly Figure; an empty-state figure when no domain data exists.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    # Compute distribution statistics per domain.
    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col in df.columns:
            domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
            if len(domain_values) > 0:
                gap_data.append({
                    'Domain': domain,
                    'Min': domain_values.min(),
                    'Max': domain_values.max(),
                    'Median': domain_values.median(),
                    'Q1': domain_values.quantile(0.25),
                    'Q3': domain_values.quantile(0.75),
                    'Gap': domain_values.max() - domain_values.min()
                })

    if not gap_data:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(gap_data)
    df_gap = df_gap.sort_values('Gap', ascending=True)

    fig = go.Figure()

    for idx, row in df_gap.iterrows():
        # Full min-max range line.
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip'
        ))

        # IQR band. NOTE(review): all five y-values are the same category, so
        # this renders as a thick highlighted segment rather than a box.
        fig.add_trace(go.Scatter(
            x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']],
            y=[row['Domain'], row['Domain'], row['Domain'], row['Domain'], row['Domain']],
            fill='toself',
            fillcolor='rgba(255, 210, 30, 0.3)',
            line=dict(color='#ffd21e', width=2),
            showlegend=False,
            hoverinfo='skip',
            mode='lines'
        ))

        # Median marker carries the full hover summary for the domain.
        fig.add_trace(go.Scatter(
            x=[row['Median']],
            y=[row['Domain']],
            mode='markers',
            marker=dict(
                size=12,
                color='#ffd21e',
                symbol='diamond',
                line=dict(width=2, color='#01091A')
            ),
            showlegend=False,
            hovertemplate=f"{row['Domain']}<br>" +
                          f"Min: {row['Min']:.3f}<br>" +
                          f"Q1: {row['Q1']:.3f}<br>" +
                          f"Median: {row['Median']:.3f}<br>" +
                          f"Q3: {row['Q3']:.3f}<br>" +
                          f"Max: {row['Max']:.3f}<br>" +
                          f"Gap: {row['Gap']:.3f}<br>" +
                          "<extra></extra>"
        ))

    # Endpoint dots at min and max.
    for idx, row in df_gap.iterrows():
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='markers',
            marker=dict(size=8, color='#F5F6F7', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip'
        ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"Performance Gap Analysis by Domain: {metric_display}",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"{metric_display} Score",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
        ),
        yaxis=dict(
            title=dict(
                text="Business Domain",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1450,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False
    )

    # Manual legend since all traces hide their legend entries.
    fig.add_annotation(
        text="◆ Median ━ IQR ─ Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='#94A3B8'),
        showarrow=False
    )

    return fig
+
+
+def create_empty_chart(message):
+ """Create an empty chart with a message"""
+ fig = go.Figure()
+
+ fig.add_annotation(
+ text=f"📊 {message}",
+ xref="paper", yref="paper",
+ x=0.5, y=0.5,
+ xanchor='center', yanchor='middle',
+ font=dict(
+ size=18,
+ color="#94A3B8",
+ family="'Geist', sans-serif"
+ ),
+ showarrow=False,
+ bgcolor="rgba(245, 246, 247, 0.05)",
+ bordercolor="rgba(245, 246, 247, 0.2)",
+ borderwidth=1,
+ borderpad=20
+ )
+
+ fig.update_layout(
+ paper_bgcolor="#01091A",
+ plot_bgcolor="rgba(245, 246, 247, 0.02)",
+ height=700,
+ width=1450,
+ margin=dict(t=80, b=80, l=80, r=80)
+ )