"""
# Maps the UI's model-access filter labels ("OSS"/"API") to the values
# actually stored in the dataframe's 'Model Type' column.
model_type_lookup = {
"OSS": "Open source",
"API": "Proprietary"
}
def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"):
    """Filter the leaderboard dataframe by model type, then sort it.

    Returns a ``(filtered_df, level_key, highlight_column)`` tuple, where
    ``highlight_column`` names the SR column the table should emphasise for
    the active level (``None`` when no matching column exists).
    """
    result = df.copy()
    level_key = resolve_level(level_filter)
    # Restrict to one model-access type unless "All" is requested.
    if model_type_filter != "All":
        target_type = model_type_lookup.get(model_type_filter, model_type_filter)
        result = result[result['Model Type'] == target_type]
    # Resolve the sort column: the explicit sort_by wins when present,
    # otherwise fall back to the overall or per-level SR column.
    sort_column = sort_by if sort_by in result.columns else None
    if not sort_column:
        if level_key == "ALL":
            sort_column = overall_sort_column if overall_sort_column in result.columns else None
        else:
            sort_column = sr_column_map.get(level_key)
    # Decide which column the rendered table should highlight.
    highlight_column = None
    if level_key in sr_column_map:
        highlight_column = sr_column_map[level_key]
    elif level_key == "ALL" and overall_sort_column in result.columns:
        highlight_column = overall_sort_column
    if sort_column and sort_column in result.columns:
        result = result.sort_values(
            by=sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
        )
    return result, level_key, highlight_column
def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order):
    """Load the leaderboard, apply the shared filters, and render it as HTML."""
    frame = load_leaderboard_data()
    filtered, _level_key, highlight = apply_filters(
        frame, level_filter, model_type_filter, sort_order, sort_by
    )
    # The table is rendered client-side from an HTML string.
    return generate_html_table(filtered, highlight)
# Load initial data
# NOTE(review): indentation and several quoted HTML literals in this section
# were mangled by extraction — confirm against the original file before editing.
initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending")
initial_df = load_leaderboard_data() # Load raw data for model selector
if not initial_df.empty:
overall_success_numeric = pd.to_numeric(initial_df.get('Overall Success'), errors='coerce')
if overall_success_numeric.notna().any():
initial_df = initial_df.assign(**{'Overall Success': overall_success_numeric}).sort_values(
'Overall Success', ascending=False, na_position='last'
)
else:
initial_df = initial_df.sort_values('Model')
# Seed the comparison widgets: top-5 models for the radar, top-12 for the heatmap.
initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else []
initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else []
initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models)
# Defaults for the per-level metric chart (first level, top-5 models).
initial_level_metric_level = level_ids[0] if level_ids else None
initial_level_model_choices = initial_df['Model'].tolist() if len(initial_df) > 0 else []
initial_level_model_values = initial_level_model_choices[:5]
initial_level_metric_chart = create_level_metric_chart(
initial_df,
initial_level_metric_level,
initial_level_model_values
) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available")
# Load custom CSS and responsive styles
custom_css = get_leaderboard_css() + get_responsive_styles() + """
"""
gr.HTML(custom_css)
# Header styles and navigation
gr.HTML("""
""")
gr.HTML("
")
gr.Image(
value="banner_wide.png",
show_label=False,
interactive=False,
type="filepath",
elem_id="hero-banner"
)
gr.HTML("
")
gr.HTML("""
Hugging Face KREW Ko-AgentBench
한국 실사용 환경 특화 에이전트 벤치마크
""")
# Links section below title
gr.HTML("""
""")
# Section 1: 단계별 태스크 설계
gr.HTML("""
단순 도구 호출부터 장기적 맥락 능력, 강건성 처리 능력까지 에이전트의 능력을 7단계로 입체적으로 분석하였습니다.
단일 턴
80%
- L1: 단일 도구 호출
- L2: 도구 선택
- L3: 도구 순차 추론
- L4: 도구 병렬 추론
- L5: 오류 처리와 강건성
다중 턴
20%
- L6: 효율적인 도구 활용
- L7: 장기 컨텍스트 기억
""")
# Section 2: 핵심 시나리오 구성
gr.HTML("""
네이버, 카카오 등 국내 실사용 API를 기반으로, '약속 예약', '블로그 후기 검색'처럼 일상에 유용한 현실적인 문제 해결 시나리오를 구현했습니다.
⌄
""")
# Section 3: 핵심 평가 기준
gr.HTML("""
핵심 기반 반복 평가
- 실패 API 응답 개선
- '정보 속성 불일치성 변경' 등 기존 벤치마크의 고질적 문제 해결
- 벤치마크의 일관성과 신뢰도 보장
강건성 테스트
- 의도된 오류 상황(상품 단종)의 오류 인식/대응 능력(전략)까지 평가
- 현실 환경에서도 안정적으로 작동하는 모델 선별
단계별 고유 정밀 지표
- 도구 선택, 파라미터 구성, 데이터 흐름 등 문제 해결의 불필요/소요 단계별 평가
- 모델의 강/약점 정량적으로 식별
""")
# Metrics overview cards removed per updated design
# Domain filter section with enhanced styling
gr.HTML("""
""")
# Level choices come from the level_details mapping defined elsewhere in the module.
level_options = list(level_details.keys())
# Main leaderboard table with dynamic title and integrated controls
leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
# Integrated controls within leaderboard section - stacked vertically
gr.HTML("
태스크 레벨 선택
")
# Radio group driving which task level (L1-L7 / ALL) the table shows.
domain_filter = gr.Radio(
choices=level_options,
value=default_level,
label="",
interactive=True,
container=False,
elem_classes=["domain-radio", "inline-radio"]
)
gr.HTML("
🔍 필터 및 정렬
")
# Side-by-side model-access and sort-order radios.
with gr.Row():
with gr.Column(scale=1):
gr.HTML("
모델 접근")
model_type_filter = gr.Radio(
choices=["All", "OSS", "API"],
value="All",
label="",
elem_classes=["domain-radio", "inline-radio"],
container=False
)
with gr.Column(scale=1):
gr.HTML("
정렬 순서")
sort_order = gr.Radio(
choices=["Descending", "Ascending"],
value="Descending",
label="",
elem_classes=["domain-radio", "inline-radio"],
container=False
)
# The leaderboard itself is rendered from an HTML string.
leaderboard_table = gr.HTML(initial_table)
# Radar Chart Section
gr.HTML("""
핵심 역량 레이더
6가지 필수 핵심 요소(성공, 실행, 추론, 강건성, 효율성, 호출 유효성)를 추적합니다.
""")
gr.HTML("
비교할 모델을 선택하세요. 최대 5개까지 가능합니다.
")
# gr.HTML("
모델은 최대 5개까지 선택 가능 합니다.
")
# Multi-select dropdown limited to the top-10 models; default selection = top-5.
model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist()[:10],
value=initial_df['Model'].tolist()[:5],
multiselect=True,
label="",
info=None,
container=False,
)
# Radar chart plot - wrapped in centered container
gr.HTML('
')
radar_chart = gr.Plot(
label="",
value=create_domain_radar_chart(
load_leaderboard_data(),
initial_df['Model'].tolist()[:5]
),
elem_classes=["radar-chart", "plot-container"]
)
gr.HTML('
')
gr.HTML("
")
def generate_performance_card(model_name):
"""Generate HTML for the model performance card"""
if not model_name:
return """
Please select a model to generate its performance card
"""
# Get model data
df = load_leaderboard_data()
model_data = df[df['Model'] == model_name]
if model_data.empty:
return """
Model not found in the database
"""
row = model_data.iloc[0]
# Get overall rank based on overall success
df_with_success = df.copy()
df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce')
df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
try:
rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
except:
rank = 'N/A'
# Format values
def format_value(val, decimals=3, prefix='', suffix=''):
if pd.isna(val) or val == '':
return 'N/A'
return f"{prefix}{float(val):.{decimals}f}{suffix}"
def format_score(value):
if pd.isna(value) or value == '':
return 'N/A'
return f"{float(value):.3f}"
radar_metrics = [
("기초 수행력", row.get('Execution Accuracy')),
("복합 추론력", row.get('Complex Reasoning')),
("견고성", row.get('Robustness')),
("맥락 효율성", row.get('Context & Efficiency')),
("전반적 성공률", row.get('Overall Success')),
("기본적 유효성", row.get('Call Validity')),
]
radar_values = []
radar_labels = []
for label, value in radar_metrics:
if pd.isna(value) or value == '':
radar_values.append(0.0)
else:
try:
radar_values.append(max(0.0, min(1.0, float(value))))
except (TypeError, ValueError):
radar_values.append(0.0)
radar_labels.append(label)
mini_radar_html = build_static_radar_chart(radar_values, radar_labels)
level_blocks = []
for level in level_ids:
sr_col = sr_column_map.get(level)
level_blocks.append((level, row.get(sr_col, '')))
evaluation_date = EVALUATION_DATE
icon_html = ""
if KREW_ICON_BASE64:
icon_html = f'

'
else:
icon_html = '
🤖
'
card_html = f"""
"""
return card_html
# MODEL PERFORMANCE CARD SECTION
gr.HTML("""
모델 성능 카드
모델의 성능 스펙트럼을 6대 핵심 지표와 L1~L7 단계별 종합 성공률(SR)로 시각화한 정밀 분석 카드를 확인해보세요.
※ Rank는 L1~L7 단계별 SR의 평균값을 기준으로 선정되었습니다.
""")
# Dropdown choosing which model the performance card describes.
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
gr.HTML("""
분석 카드를 생성할 모델을 선택하세요.
""")
card_model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist(),
value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None,
label="",
info=None,
container=False,
# elem_classes=["model-dropdown"]
)
# Button wired further below to an html2canvas screenshot of the card.
download_card_btn = gr.Button(
"PNG로 다운로드",
elem_id="download-card-btn",
elem_classes=["pill-button"]
)
gr.HTML("""
""")
# Card display area - generate initial card
initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None
initial_card_html = generate_performance_card(initial_model) if initial_model else ""
card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html")
gr.HTML("""
""")
# Level metric breakdown section
gr.HTML("""
레벨별 상세 지표
각 Ko-AgentBench 단계별 고유 평가 지표를 통해 모델 점수를 비교하고 더 자세히 살펴보세요.
""")
# Selectors for the per-level metric chart: one level, up to five models.
with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
level_metric_selector = gr.Dropdown(
choices=level_ids,
value=level_ids[0] if level_ids else None,
multiselect=False,
label="",
info=None,
container=False,
elem_classes=["level-dropdown"]
)
level_model_selector = gr.Dropdown(
choices=initial_level_model_choices,
value=initial_level_model_values,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
)
gr.HTML('
')
level_metric_chart = gr.Plot(
label="",
value=initial_level_metric_chart,
elem_classes=["level-metric-plot", "plot-container"]
)
gr.HTML("""
""")
# # Heatmap section
# gr.HTML("""
#
#
#
종합 성능 히트맵
#
각 모델의 L1~L7 Ko-AgentBench SR(성공률) 점수를 한눈에 보세요.
#
#
# """)
# heatmap_chart = gr.Plot(
# label="",
# value=initial_heatmap,
# elem_classes=["heatmap-plot", "plot-container"]
# )
# gr.HTML("""
#
#
# """)
def get_optimal_sort_order(sort_by_value):
    """Return the preferred sort direction for a metric name.

    "Descending" for higher-is-better metrics; "Ascending" for
    lower-is-better ones; defaults to "Descending" otherwise.
    """
    # Higher-is-better: overall success plus every per-level SR column.
    higher_is_better = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
    # Lower-is-better: currently empty (reserved for e.g. cost metrics).
    lower_is_better = []
    if sort_by_value in higher_is_better:
        return "Descending"
    if sort_by_value in lower_is_better:
        return "Ascending"
    # Default fallback when the metric is unrecognised.
    return "Descending"
def update_table(level_filter, model_type_filter, sort_order):
    """Rebuild the leaderboard title and table HTML for the current filters."""
    title_html = update_leaderboard_title(level_filter)
    # "ALL" sorts on the aggregate metric; a concrete level sorts on its SR column.
    if level_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(level_filter), "Overall Success")
    table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
    return title_html, table_html
def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
"""Refresh the radar and level-metric widgets after a filter change.

Returns (model_selector update, radar figure, level_model_selector update,
level metric figure) matching the Gradio outputs wired below.
"""
# Get filtered dataframe
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
# Update model selector choices based on filtered data
available_models_all = filtered_df['Model'].tolist()
available_models = available_models_all[:15] # Top 15 from filtered results
# If selected models are not in available models, reset to top 5
if selected_models:
valid_selected = [m for m in selected_models if m in available_models]
# Check if more than 5 models are selected and show alert
if len(valid_selected) > 5:
gr.Warning("최대 5개 까지만 선택 가능합니다")
# Remove the last selected item (6th item) instead of keeping first 5
valid_selected = valid_selected[:-1]
if not valid_selected:
valid_selected = available_models[:5]
else:
valid_selected = available_models[:5]
# Create radar chart
chart = create_domain_radar_chart(filtered_df, valid_selected)
# Prepare heatmap order prioritizing selected models
# Level metric chart
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
available_level_models = available_models_all
if level_selected_models:
# Cap the level chart's model list at five as well.
valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
if not valid_level_models:
valid_level_models = available_level_models[:5]
else:
valid_level_models = available_level_models[:5]
level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
gr.Dropdown(
choices=available_models,
value=valid_selected,
multiselect=True,
label="",
info=None,
container=False,
# elem_classes=["model-dropdown"]
),
chart,
gr.Dropdown(
choices=available_level_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_metric_fig,
)
def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
"""Handle a model-selector change: rebuild the radar (and level chart).

Unlike update_radar_chart, selections are validated against the full
filtered model list rather than just the top 15.
"""
# Get filtered dataframe
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
available_models_all = filtered_df['Model'].tolist()
if selected_models:
valid_selected = [m for m in selected_models if m in available_models_all]
# Check if more than 5 models are selected and show alert
if len(valid_selected) > 5:
# JavaScript alert for exceeding 5 models
gr.Warning("최대 5개 까지만 선택 가능합니다")
# Remove the last selected item (6th item) instead of keeping first 5
valid_selected = valid_selected[:-1]
if not valid_selected:
valid_selected = available_models_all[:5]
else:
valid_selected = available_models_all[:5]
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
available_level_models = available_models_all
if level_selected_models:
valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
if not valid_level_models:
valid_level_models = available_level_models[:5]
else:
valid_level_models = available_level_models[:5]
level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
gr.Dropdown(
choices=available_models_all[:15],
value=valid_selected,
multiselect=True,
label="",
info=None,
container=False,
),
create_domain_radar_chart(filtered_df, valid_selected),
gr.Dropdown(
choices=available_level_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_metric_fig,
)
def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
"""Handle level / level-model selector changes: rebuild only the level chart."""
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
available_models = filtered_df['Model'].tolist()
if level_selected_models:
valid_level_models = [m for m in level_selected_models if m in available_models]
# Check if more than 5 models are selected and show alert
if len(valid_level_models) > 5:
gr.Warning("최대 5개 까지만 선택 가능합니다")
# Remove the last selected item (6th item) instead of keeping first 5
valid_level_models = valid_level_models[:-1]
if not valid_level_models:
valid_level_models = available_models[:5]
else:
valid_level_models = available_models[:5]
# Fall back to the first known level when the selection is unrecognised.
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
gr.Dropdown(
choices=available_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_chart,
)
# Update table when filters change
filter_inputs = [domain_filter, model_type_filter, sort_order]
for input_component in filter_inputs:
input_component.change(
fn=update_table,
inputs=filter_inputs,
outputs=[leaderboard_title, leaderboard_table]
)
# Also update radar chart when filters change
# NOTE(review): indentation was lost in extraction — confirm whether this
# second .change registration sits inside the loop (once per filter input)
# or after it (only on the last component).
input_component.change(
fn=update_radar_chart,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
)
# Update radar chart when model selection changes
model_selector.change(
fn=update_radar_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
)
# Level and level-model selectors both refresh only the level metric chart.
level_metric_selector.change(
fn=update_level_metric_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[level_model_selector, level_metric_chart]
)
level_model_selector.change(
fn=update_level_metric_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[level_model_selector, level_metric_chart]
)
# Add custom CSS for the performance card
gr.HTML("""
""")
# Wire up the card generator to selection change
card_model_selector.change(
fn=generate_performance_card,
inputs=[card_model_selector],
outputs=[card_display]
)
# Wire up download button with html2canvas capture
# Client-side only (fn=None): the js callback lazily loads html2canvas from
# a CDN, snapshots the .performance-card node, and triggers a PNG download.
download_card_btn.click(
fn=None,
js="""
async () => {
const ensureHtml2Canvas = () => new Promise((resolve, reject) => {
if (window.html2canvas) {
resolve(window.html2canvas);
return;
}
const existing = document.querySelector('script[data-html2canvas]');
if (existing) {
existing.addEventListener('load', () => resolve(window.html2canvas));
existing.addEventListener('error', reject);
return;
}
const script = document.createElement('script');
script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js';
script.async = true;
script.dataset.html2canvas = 'true';
script.onload = () => resolve(window.html2canvas);
script.onerror = () => reject(new Error('Failed to load html2canvas'));
document.head.appendChild(script);
});
const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
await pause(60);
const card = document.querySelector('.performance-card');
if (!card) {
alert('Performance card not found. Please select a model first.');
return;
}
const btn = document.getElementById('download-card-btn');
const originalText = btn?.textContent || '';
if (btn) {
btn.textContent = 'Generating...';
btn.disabled = true;
}
try {
const html2canvasLib = await ensureHtml2Canvas();
if (!html2canvasLib) {
throw new Error('html2canvas unavailable');
}
const canvas = await html2canvasLib(card, {
backgroundColor: '#01091A',
scale: 2,
logging: false,
useCORS: true
});
const link = document.createElement('a');
const modelName = card.querySelector('.card-model-name')?.textContent || 'model';
const timestamp = new Date().toISOString().slice(0, 10);
const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`;
link.download = fileName;
link.href = canvas.toDataURL('image/png');
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
} catch (error) {
console.error('Error capturing card:', error);
alert('Failed to capture performance card. Please try again.');
} finally {
if (btn) {
btn.textContent = originalText;
btn.disabled = false;
}
}
}
"""
)
# Also update card when filters change to keep model selector in sync
# NOTE(review): this redefines update_dropdown_and_card once per filter input;
# harmless (the body only reads *args), but a single definition would suffice.
for input_component in filter_inputs:
def update_dropdown_and_card(*args):
filtered_df, _, _ = apply_filters(
load_leaderboard_data(),
args[0],
args[1],
args[2],
"Overall Success" if args[0] == "ALL" else sr_column_map.get(resolve_level(args[0]), "Overall Success")
)
choices = filtered_df['Model'].tolist()
# Select first model from filtered list
value = choices[0] if choices else None
return gr.Dropdown(
choices=choices,
value=value,
label="",
info=None,
container=False,
# elem_classes=["model-dropdown"]
)
input_component.change(
fn=update_dropdown_and_card,
inputs=filter_inputs,
outputs=[card_model_selector]
)
# Hand the table component back to the caller that assembles the tab.
return leaderboard_table
def create_leaderboard_v2_interface():
    """Build and return the complete v2 leaderboard interface (thin wrapper)."""
    return create_leaderboard_v2_tab()
def create_domain_radar_chart(df, selected_models=None, max_models=5):
"""Visualize six core capability metrics on a radar chart."""
df = df.copy()
# Axis definitions: dataframe column, display label, and hover description.
metrics_info = [
{"column": "Overall Success", "label": "Overall Success", "description": "Average SR across L1-L7"},
{"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
{"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
{"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
{"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
{"column": "Call Validity", "label": "Call Validity", "description": "Average EPR_CVR across levels"},
]
required_columns = [m["column"] for m in metrics_info]
# Bail out with an empty-state figure when no metric columns are present.
if df.empty or not any(col in df.columns for col in required_columns):
return create_empty_radar_chart("Not enough data to build the capability radar")
# Default model selection
if not selected_models:
if "Overall Success" in df.columns:
top_models = df.sort_values("Overall Success", ascending=False)
else:
top_models = df
selected_models = top_models['Model'].head(max_models).tolist()
selected_models = selected_models[:max_models]
# Ensure metric columns are numeric
for metric in metrics_info:
col = metric["column"]
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce')
fig = go.Figure()
angle_labels = [m["label"] for m in metrics_info]
# One fill/line colour pair per model, cycled if more than five traces.
palette = [
{'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'},
{'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'},
{'fill': 'rgba(161, 98, 7, 0.22)', 'line': '#A16207'},
{'fill': 'rgba(220, 38, 38, 0.20)', 'line': '#DC2626'},
{'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'},
]
# One closed Scatterpolar trace per selected model.
for idx, model_name in enumerate(selected_models):
model_data = df[df['Model'] == model_name]
if model_data.empty:
continue
row = model_data.iloc[0]
values = []
tooltips = []
for metric in metrics_info:
col = metric["column"]
value = row[col] if col in row else float('nan')
# Missing values are plotted as 0 so the polygon stays closed.
if pd.isna(value) or value == '':
value = 0
values.append(float(value))
tooltips.append(metric["description"])
if not values:
continue
# Repeat the first point to close the polygon.
values_loop = values + [values[0]]
angles_loop = angle_labels + [angle_labels[0]]
tooltips_loop = tooltips + [tooltips[0]]
colors = palette[idx % len(palette)]
fig.add_trace(
go.Scatterpolar(
r=values_loop,
theta=angles_loop,
fill='toself',
fillcolor=colors['fill'],
line=dict(color=colors['line'], width=3),
marker=dict(
size=10,
color=colors['line'],
symbol='circle',
line=dict(width=2, color='#01091A')
),
name=model_name,
customdata=tooltips_loop,
mode="lines+markers",
hovertemplate="
%{fullData.name}" +
"
%{theta}" +
"
%{customdata}" +
"
%{r:.3f}" +
"
",
hoverlabel=dict(
bgcolor="rgba(1, 9, 26, 0.95)",
bordercolor=colors['line'],
font=dict(color="white", size=12, family="'Geist', sans-serif")
)
)
)
# Radial ticks at 0.00, 0.20, ... 1.00.
tick_vals = [i / 5 for i in range(6)]
tick_text = [f"{val:.2f}" for val in tick_vals]
fig.update_layout(
polar=dict(
bgcolor='rgba(245, 246, 247, 0.03)',
radialaxis=dict(
visible=True,
range=[0, 1],
showline=True,
linewidth=2,
linecolor='rgba(245, 246, 247, 0.2)',
gridcolor='rgba(245, 246, 247, 0.1)',
gridwidth=1,
tickvals=tick_vals,
ticktext=tick_text,
tickfont=dict(
size=11,
color='white',
family="'Geist Mono', monospace"
)
),
angularaxis=dict(
showline=True,
linewidth=2,
linecolor='rgba(245, 246, 247, 0.2)',
gridcolor='rgba(245, 246, 247, 0.08)',
tickfont=dict(
size=13,
family="'Geist', sans-serif",
color='white',
weight=600
),
rotation=90,
direction="clockwise",
),
),
showlegend=True,
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.15,
xanchor="center",
x=0.5,
font=dict(size=12, family="'Geist', sans-serif", color='white'),
bgcolor='rgba(1, 9, 26, 0.8)',
bordercolor='rgba(245, 246, 247, 0.2)',
borderwidth=1,
itemsizing='constant',
itemwidth=30
),
title=dict(
text="
Core Capability Radar",
x=0.5,
y=0.97,
font=dict(
size=22,
family="'Geist', sans-serif",
color="white",
weight=700
),
),
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=800,
width=900,
margin=dict(t=30, b=50, l=10, r=10),
autosize=True,
annotations=[]
)
return fig
def create_performance_heatmap(df, ordered_models=None, max_models=12):
"""Render a heatmap of SR scores across task levels for selected models."""
df = df.copy()
level_sequence = [f"L{i}" for i in range(1, 8)]
sr_columns = []
for level in level_sequence:
col = f"{level}_SR"
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce")
sr_columns.append((level, col))
if df.empty or not sr_columns:
return create_empty_heatmap("Not enough SR data to render the heatmap")
df = df.drop_duplicates(subset=["Model"])
if df.empty:
return create_empty_heatmap("No models available to render the heatmap")
sort_column = "Overall Success" if "Overall Success" in df.columns else sr_columns[0]
df = df.sort_values(sort_column, ascending=False)
if ordered_models:
ordered_models = [m for m in ordered_models if m in df["Model"].tolist()]
else:
ordered_models = df["Model"].tolist()
if not ordered_models:
return create_empty_heatmap("No models available to render the heatmap")
ordered_models = ordered_models[:max_models]
heatmap_df = df.set_index("Model").reindex(ordered_models)
level_labels = []
z_matrix = []
has_values = False
for level, col in sr_columns:
if col not in heatmap_df.columns:
continue
label = f"{level} · SR"
level_labels.append(label)
row_values = []
for model in ordered_models:
value = heatmap_df.at[model, col] if model in heatmap_df.index else None
if pd.isna(value):
row_values.append(None)
else:
val = float(value)
row_values.append(val)
has_values = True
z_matrix.append(row_values)
if not level_labels or not has_values:
return create_empty_heatmap("Not enough SR data to render the heatmap")
colorscale = [
[0.0, "#0A0A0A"],
[0.25, "#1A1411"],
[0.5, "#332818"],
[0.75, "#B8660A"],
[1.0, "#FFD21E"],
]
fig = go.Figure()
fig.add_trace(
go.Heatmap(
z=z_matrix,
x=ordered_models,
y=level_labels,
colorscale=colorscale,
zmin=0,
zmax=1,
hovertemplate="
%{y}%{x}SR · %{z:.3f}
",
colorbar=dict(
title="Success Rate",
titlefont=dict(color="white", family="'Geist', sans-serif", size=12),
tickfont=dict(color="white", family="'Geist', sans-serif", size=10),
thickness=12,
len=0.7,
outlinecolor="rgba(255, 255, 255, 0.1)",
bgcolor="rgba(1, 9, 26, 0.75)"
),
showscale=True
)
)
annotations = []
for y_idx, level in enumerate(level_labels):
for x_idx, model in enumerate(ordered_models):
value = z_matrix[y_idx][x_idx]
if value is None:
continue
font_color = "#0B1120" if value >= 0.6 else "#F8FAFC"
annotations.append(
dict(
x=model,
y=level,
text=f"{value:.3f}",
showarrow=False,
font=dict(
family="'Geist Mono', monospace",
size=11,
color=font_color
)
)
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
margin=dict(t=80, b=90, l=110, r=160),
height=520,
width=1450,
font=dict(family="'Geist', sans-serif", color="white"),
xaxis=dict(
tickangle=-25,
showgrid=False,
ticks="",
tickfont=dict(size=11, family="'Geist', sans-serif", color="white")
),
yaxis=dict(
showgrid=False,
ticks="",
tickfont=dict(size=12, family="'Geist', sans-serif", color="white")
),
annotations=annotations,
title=dict(
text="
Comprehensive Performance Heatmap",
x=0.5,
y=0.98,
font=dict(
size=20,
family="'Geist', sans-serif",
color="white",
weight=700
),
)
)
fig.update_xaxes(side="bottom")
return fig
def create_empty_heatmap(message):
"""Render an empty state for the heatmap with a centered message."""
fig = go.Figure()
# Single centered annotation carries the empty-state message.
fig.add_annotation(
text=f"🗺️ {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(
size=18,
color="white",
family="'Geist', sans-serif"
),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
# Same canvas size/title as the populated heatmap so the layout is stable.
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=520,
width=1450,
margin=dict(t=80, b=80, l=80, r=160),
title=dict(
text="
Comprehensive Performance Heatmap",
x=0.5,
y=0.98,
font=dict(
size=20,
family="'Geist', sans-serif",
color="white",
weight=700
),
)
)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
return fig
def create_level_metric_chart(df, level, selected_models=None, max_models=5):
"""Render a grouped horizontal bar chart showing per-model scores for a level's metrics."""
if not level:
return create_empty_level_metric_chart("Select a level to view its metrics")
df = df.copy()
level_prefix = f"{level}_"
level_columns = [col for col in df.columns if col.startswith(level_prefix)]
# Keep only numeric metric columns whose values look like 0-1 rates;
# cost columns and out-of-range columns are excluded from the chart.
metric_columns = []
for col in level_columns:
metric_suffix = col[len(level_prefix):]
metric_key_lower = metric_suffix.lower()
if "cost" in metric_key_lower:
continue
numeric_series = pd.to_numeric(df[col], errors='coerce')
valid_values = numeric_series.dropna()
if valid_values.empty:
continue
if (valid_values < 0).any() or (valid_values > 1.05).any():
continue
df[col] = numeric_series
metric_columns.append(col)
if not metric_columns:
return create_empty_level_metric_chart("This level has no 0-1 metrics to visualize")
df = df.drop_duplicates(subset=['Model'])
if df.empty:
return create_empty_level_metric_chart("No models available to render level metrics")
# Model order: caller selection first, otherwise best-first by overall success.
if selected_models:
model_order = [m for m in selected_models if m in df['Model'].tolist()]
else:
sort_col = 'Overall Success' if 'Overall Success' in df.columns else metric_columns[0]
model_order = df.sort_values(sort_col, ascending=False)['Model'].tolist()
if not model_order:
model_order = df['Model'].tolist()
model_order = model_order[:max_models]
df_models = df[df['Model'].isin(model_order)].set_index('Model')
if df_models.empty:
return create_empty_level_metric_chart("No matching models for selected filters")
# Turn a raw column suffix (e.g. "call_em") into a display label ("CallEM").
def prettify_metric_name(metric_key):
raw = metric_key[len(level_prefix):]
text = raw.replace('_', ' ')
text = re.sub(r'(?<=.)([A-Z])', r' \1', text)
text = text.replace('Avg', 'Average')
replacements = {
'Sr': 'SR',
'Ac': 'AC',
'Tsq': 'TSQ',
'Cvr': 'CVR',
'Psm': 'PSM',
'Prov': 'Prov',
'Call Em': 'CallEM',
'Reuse Rate': 'Reuse Rate',
'Eff Score': 'Eff Score'
}
words = text.title().split()
words = [replacements.get(word, word) for word in words]
return ' '.join(words)
# De-duplicate labels by appending " (2)", " (3)", ... when metrics collide.
metric_labels = []
for col in metric_columns:
label = prettify_metric_name(col)
if label in metric_labels:
suffix = 2
while f"{label} ({suffix})" in metric_labels:
suffix += 1
label = f"{label} ({suffix})"
metric_labels.append(label)
# One bar colour per model, cycled if more than six models.
model_palette = [
'#ffd21e',
'#FF8A3C',
'#A16207',
'#DC2626',
'#F8FAFC',
'#38BDF8',
]
fig = go.Figure()
# Track the largest plotted value to size the x-axis range.
max_value = 0
for idx, model in enumerate(model_order):
values = []
for col in metric_columns:
value = df_models.at[model, col] if (model in df_models.index and col in df_models.columns) else float('nan')
if pd.notna(value):
values.append(float(value))
max_value = max(max_value, float(value))
else:
values.append(None)
color = model_palette[idx % len(model_palette)]
fig.add_trace(
go.Bar(
name=model,
y=metric_labels,
x=values,
orientation='h',
marker=dict(color=color, line=dict(color='rgba(1,9,26,0.8)', width=1)),
hovertemplate="
%{y}Model ·
%{fullData.name}Score · %{x:.3f}
",
)
)
# Grow the chart with the number of metric rows; 5% x-axis headroom.
plot_height = max(360, 140 + 48 * len(metric_labels))
if max_value <= 0:
x_range = [0, 1]
else:
x_range = [0, max_value * 1.05]
fig.update_layout(
barmode='group',
bargap=0.25,
bargroupgap=0.18,
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=plot_height,
autosize=True,
margin=dict(t=90, b=80, l=220, r=160),
legend=dict(
orientation="h",
yanchor="bottom",
y=1.02,
xanchor="right",
x=1,
bgcolor='rgba(1, 9, 26, 0.75)',
bordercolor='rgba(245, 246, 247, 0.2)',
borderwidth=1,
font=dict(size=11, family="'Geist', sans-serif", color='white')
),
xaxis=dict(
title=dict(text=f"
{level} Metric Score", font=dict(size=14, color="white")),
tickfont=dict(size=11, color="white"),
gridcolor='rgba(245, 246, 247, 0.08)',
zerolinecolor='rgba(245, 246, 247, 0.18)',
range=x_range
),
yaxis=dict(
tickfont=dict(size=13, color="white"),
automargin=True
),
title=dict(
text=f"
{level} Metric Breakdown",
x=0.5,
y=0.98,
font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
)
)
return fig
def create_empty_level_metric_chart(message):
    """Create a placeholder level-metric chart that only shows *message*.

    Used when no per-level metric data is available so the UI still
    renders a consistently styled (but empty) chart area.

    Args:
        message: Text displayed in the center of the empty chart.

    Returns:
        A plotly Figure with hidden axes and a centered annotation.
    """
    fig = go.Figure()
    fig.add_annotation(
        text=f"🧭 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(size=18, color="white", family="'Geist', sans-serif"),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=420,
        autosize=True,
        margin=dict(t=80, b=60, l=80, r=120),
        title=dict(
            # Markup was stripped upstream; restored as a bold title.
            text="<b>Level Metric Breakdown</b>",
            x=0.5,
            y=0.98,
            font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
        )
    )
    # No data to show, so hide both axes entirely.
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig
def create_empty_radar_chart(message):
    """Create an empty radar chart with a message.

    Args:
        message: Text displayed in the center of the empty chart.

    Returns:
        A plotly Figure sized like the real radar chart, containing
        only the message annotation and a title.
    """
    fig = go.Figure()
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="white",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    # Footer annotation is APPENDED here rather than passed via
    # update_layout(annotations=[...]), which would replace the whole
    # annotations list and wipe out the message annotation above.
    # NOTE(review): its text was lost upstream — confirm intended caption.
    fig.add_annotation(
        text="",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False
    )
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1450,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            # Markup was stripped upstream; restored as a bold title.
            text="<b>Core Capability Radar</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="white",
                weight=700
            ),
        )
    )
    return fig
# NEW VISUALIZATION FUNCTIONS
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot showing cost vs performance efficiency.

    Args:
        df: Leaderboard dataframe with 'Avg Total Cost', 'Avg Turns',
            'Model', 'Model Type' and the chosen metric column.
        metric: Performance column for the x-axis ("Avg AC" or another
            metric column such as "Avg TSQ").

    Returns:
        A plotly Figure, or an empty placeholder chart when no rows
        have both cost and performance values.
    """
    # Filter out models without cost or performance data (empty strings).
    df_filtered = df[(df['Avg Total Cost'] != '') & (df[metric] != '')].copy()
    # Short display names for the legend.
    label_map = {
        'Proprietary': 'API',
        'Open source': 'OSS'
    }
    if df_filtered.empty:
        return create_empty_chart("No data available for cost-performance analysis")
    # Convert relevant columns to numeric; bad values become NaN.
    df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce')
    df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce')
    df_filtered['Avg Turns'] = pd.to_numeric(df_filtered['Avg Turns'], errors='coerce')
    # Color mapping for model type.
    color_map = {
        'Proprietary': '#1098F7',  # Airglow Blue for Proprietary
        'Open source': '#58BC82'   # Green for Open source
    }
    df_filtered['Color'] = df_filtered['Model Type'].map(color_map).fillna('#F5F6F7')
    fig = go.Figure()
    # One trace per model type so the legend groups points by type.
    for model_type in df_filtered['Model Type'].unique():
        df_type = df_filtered[df_filtered['Model Type'] == model_type]
        legend_name = label_map.get(model_type, model_type)
        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type['Avg Total Cost'],
            mode='markers+text',
            name=legend_name,
            text=df_type['Model'],
            textposition="top center",
            textfont=dict(size=10, color='white'),
            # Carry the raw turn count so the hover shows it un-scaled
            # (marker.size is turns * 3, so %{marker.size} would be 3x off).
            customdata=df_type['Avg Turns'],
            marker=dict(
                size=df_type['Avg Turns'] * 3,  # Size based on number of turns
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A')
            ),
            hovertemplate="<b>%{text}</b><br>" +
                          f"{metric}: %{{x:.3f}}<br>" +
                          "Cost: $%{y:.3f}<br>" +
                          "Turns: %{customdata:.1f}" +
                          "<extra></extra>"
        ))
    # Dashed median lines split the plot into quadrants.
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Total Cost'].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)
    # Quadrant labels (bottom-right = cheap & good; top-left = costly & poor).
    fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="white"), bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)")
    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Cost ($)</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='white'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )
    return fig
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create a scatter plot showing speed vs accuracy trade-off.

    Marker color encodes average session cost via a continuous scale.

    Args:
        df: Leaderboard dataframe with 'Avg Session Duration',
            'Avg Total Cost', 'Model' and the chosen metric column.
        metric: Performance column for the x-axis ("Avg AC" or another
            metric column such as "Avg TSQ").

    Returns:
        A plotly Figure, or an empty placeholder chart when no rows
        have both duration and performance values.
    """
    # Filter out models without duration or performance data (empty strings).
    df_filtered = df[(df['Avg Session Duration'] != '') & (df[metric] != '')].copy()
    if df_filtered.empty:
        return create_empty_chart("No data available for speed-accuracy analysis")
    # Convert relevant columns to numeric; bad values become NaN.
    df_filtered['Avg Session Duration'] = pd.to_numeric(df_filtered['Avg Session Duration'], errors='coerce')
    df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce')
    # Cost drives the marker color scale.
    df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce')
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_filtered[metric],
        y=df_filtered['Avg Session Duration'],
        mode='markers+text',
        text=df_filtered['Model'],
        textposition="top center",
        textfont=dict(size=9, color='white'),
        marker=dict(
            size=12,
            color=df_filtered['Avg Total Cost'],
            colorscale=[[0, '#0A0A0A'], [0.5, '#B8660A'], [1, '#ffd21e']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Cost ($)",
                    font=dict(color="white")
                ),
                tickfont=dict(color="white"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
                x=1.02
            ),
            line=dict(width=2, color='#01091A')
        ),
        hovertemplate="<b>%{text}</b><br>" +
                      f"{metric}: %{{x:.3f}}<br>" +
                      "Duration: %{y:.1f}s<br>" +
                      "Cost: $%{marker.color:.3f}" +
                      "<extra></extra>"
    ))
    # Dashed median lines split the plot into quadrants.
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Session Duration'].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)
    # Quadrant labels (bottom-right = fast & accurate; top-left = slow & inaccurate).
    fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="white", weight=600))
    fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#ffd21e", weight=600))
    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Duration (seconds)</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=120)
    )
    return fig
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart showing domain specialization.

    Bubble size encodes absolute per-domain performance; bubble color
    encodes specialization (deviation of the domain score from the
    model's overall average).

    Args:
        df: Leaderboard dataframe with 'Model', 'Model Type',
            'Avg <metric_type>' and '<Domain> <metric_type>' columns.
        metric_type: "AC" (Action Completion) or "TSQ" (Tool Selection
            Quality).

    Returns:
        A plotly Figure, or an empty placeholder chart when no domain
        data is available.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    # Build one row per (model, domain) with valid numeric scores.
    data = []
    for _, model in df.iterrows():
        if model['Model'] == '':
            continue
        model_avg = pd.to_numeric(model[f'Avg {metric_type}'], errors='coerce')
        if pd.isna(model_avg):
            continue
        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col in model and model[domain_col] != '':
                domain_val = pd.to_numeric(model[domain_col], errors='coerce')
                if not pd.isna(domain_val):
                    # Specialization strength = deviation from the model's average.
                    specialization = domain_val - model_avg
                    data.append({
                        'Model': model['Model'],
                        'Domain': domain,
                        'Performance': domain_val,
                        'Specialization': specialization,
                        'Model Type': model['Model Type']
                    })
    if not data:
        return create_empty_chart("No domain specialization data available")
    df_plot = pd.DataFrame(data)
    fig = go.Figure()
    # NOTE(review): the colorscale midpoint tracks the data range, not
    # zero — confirm whether deviation colors should be zero-centered
    # (marker cmid=0) before changing.
    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=df_plot['Performance'] * 30,  # Size based on absolute performance
            color=df_plot['Specialization'],
            colorscale=[[0, '#B8660A'], [0.5, '#E6B800'], [1, '#ffd21e']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Specialization<br>Strength",
                    font=dict(color="white")
                ),
                tickfont=dict(color="white"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1
            ),
            line=dict(width=2, color='#01091A'),
            opacity=0.8
        ),
        text=[f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
              for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
        hovertemplate="<b>%{y}</b><br>" +
                      "Domain: %{x}<br>" +
                      "%{text}" +
                      "<extra></extra>"
    ))
    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text="<b>Business Domains</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=13, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Models</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=11, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1100,
        width=1450,
        margin=dict(t=100, b=80, l=220, r=120)
    )
    return fig
def create_performance_gap_analysis(df, metric_type="AC"):
    """Create a range plot showing performance gaps by domain.

    For each business domain, draws the min–max range, the IQR band and
    the median of the chosen metric across all models, with domains
    ordered by the size of their min–max gap.

    Args:
        df: Leaderboard dataframe with '<Domain> <metric_type>' columns.
        metric_type: "AC" (Action Completion) or "TSQ" (Tool Selection
            Quality).

    Returns:
        A plotly Figure, or an empty placeholder chart when no domain
        column yields numeric data.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    # Compute the five-number summary plus gap for each domain.
    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col in df.columns:
            domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
            if len(domain_values) > 0:
                gap_data.append({
                    'Domain': domain,
                    'Min': domain_values.min(),
                    'Max': domain_values.max(),
                    'Median': domain_values.median(),
                    'Q1': domain_values.quantile(0.25),
                    'Q3': domain_values.quantile(0.75),
                    'Gap': domain_values.max() - domain_values.min()
                })
    if not gap_data:
        return create_empty_chart("No data available for gap analysis")
    df_gap = pd.DataFrame(gap_data)
    df_gap = df_gap.sort_values('Gap', ascending=True)
    fig = go.Figure()
    for _, row in df_gap.iterrows():
        # Full min–max range line for this domain's row.
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip'
        ))
        # IQR band drawn as a filled (degenerate) rectangle on the row.
        fig.add_trace(go.Scatter(
            x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']],
            y=[row['Domain'], row['Domain'], row['Domain'], row['Domain'], row['Domain']],
            fill='toself',
            fillcolor='rgba(255, 210, 30, 0.3)',
            line=dict(color='#ffd21e', width=2),
            showlegend=False,
            hoverinfo='skip',
            mode='lines'
        ))
        # Median diamond; its hover carries the full summary statistics.
        fig.add_trace(go.Scatter(
            x=[row['Median']],
            y=[row['Domain']],
            mode='markers',
            marker=dict(
                size=12,
                color='#ffd21e',
                symbol='diamond',
                line=dict(width=2, color='#01091A')
            ),
            showlegend=False,
            hovertemplate=f"<b>{row['Domain']}</b><br>" +
                          f"Min: {row['Min']:.3f}<br>" +
                          f"Q1: {row['Q1']:.3f}<br>" +
                          f"Median: {row['Median']:.3f}<br>" +
                          f"Q3: {row['Q3']:.3f}<br>" +
                          f"Max: {row['Max']:.3f}<br>" +
                          f"Gap: {row['Gap']:.3f}" +
                          "<extra></extra>"
        ))
    # White endpoint markers on the min and max of each range.
    for _, row in df_gap.iterrows():
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='markers',
            marker=dict(size=8, color='white', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip'
        ))
    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display} Score</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            # AC/TSQ scores live in [0, 1]; other metrics autoscale.
            range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
        ),
        yaxis=dict(
            title=dict(
                text="<b>Business Domain</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=13, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1450,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False
    )
    # Manual legend: the plot uses per-row traces, so a real legend
    # would repeat entries for every domain.
    fig.add_annotation(
        text="◆ Median ━ IQR ─ Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='white'),
        showarrow=False
    )
    return fig
def create_empty_chart(message):
"""Create an empty chart with a message"""
fig = go.Figure()
fig.add_annotation(
text=f"📊 {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(
size=18,
color="white",
family="'Geist', sans-serif"
),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=700,
width=1450,
margin=dict(t=80, b=80, l=80, r=80)
)