Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Harheem Kim
committed on
Commit
·
06d5e2e
1
Parent(s):
71ade2b
set white font color
Browse files
- components/leaderboard_components.py +36 -36
- styles/leaderboard_styles.py +36 -5
- tabs/leaderboard_v1.py +107 -107
components/leaderboard_components.py
CHANGED
|
@@ -8,7 +8,7 @@ def get_chart_colors():
|
|
| 8 |
"Private": "#1098F7", # Airglow Blue for Proprietary
|
| 9 |
"Open source": "#58BC82", # Green for Open source
|
| 10 |
"performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
|
| 11 |
-
"text": "
|
| 12 |
"background": "#01091A",
|
| 13 |
"grid": (0, 0, 0, 0.1), # RGBA tuple for grid
|
| 14 |
}
|
|
@@ -47,7 +47,7 @@ def get_rank_badge(rank):
|
|
| 47 |
align-items: center;
|
| 48 |
justify-content: center;
|
| 49 |
min-width: 28px;
|
| 50 |
-
color:
|
| 51 |
font-weight: 500;
|
| 52 |
">
|
| 53 |
{rank}
|
|
@@ -134,7 +134,7 @@ def get_score_bar(score):
|
|
| 134 |
<span style="
|
| 135 |
font-family: 'SF Mono', monospace;
|
| 136 |
font-weight: 600;
|
| 137 |
-
color:
|
| 138 |
min-width: 60px;
|
| 139 |
">{score:.3f}</span>
|
| 140 |
</div>
|
|
@@ -323,8 +323,8 @@ def get_faq_section():
|
|
| 323 |
return """
|
| 324 |
<div class="dark-container" style="margin-top: 40px; margin-bottom: 40px;">
|
| 325 |
<div class="section-header">
|
| 326 |
-
<span class="section-icon" style="color:
|
| 327 |
-
<h3 style="margin: 0; color:
|
| 328 |
Frequently Asked Questions
|
| 329 |
</h3>
|
| 330 |
</div>
|
|
@@ -332,76 +332,76 @@ def get_faq_section():
|
|
| 332 |
<div style="margin-top: 24px;">
|
| 333 |
<!-- FAQ Item 1 -->
|
| 334 |
<details class="faq-item" style="margin-bottom: 16px; background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--border-subtle);">
|
| 335 |
-
<summary style="cursor: pointer; font-weight: 600; color:
|
| 336 |
-
<span style="color:
|
| 337 |
</summary>
|
| 338 |
-
<div style="margin-top: 12px; padding-left: 28px; color:
|
| 339 |
-
<strong style="color:
|
| 340 |
</div>
|
| 341 |
</details>
|
| 342 |
|
| 343 |
<!-- FAQ Item 2 -->
|
| 344 |
<details class="faq-item" style="margin-bottom: 16px; background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--border-subtle);">
|
| 345 |
-
<summary style="cursor: pointer; font-weight: 600; color:
|
| 346 |
-
<span style="color:
|
| 347 |
</summary>
|
| 348 |
-
<div style="margin-top: 12px; padding-left: 28px; color:
|
| 349 |
-
<strong style="color:
|
| 350 |
</div>
|
| 351 |
</details>
|
| 352 |
|
| 353 |
<!-- FAQ Item 3 -->
|
| 354 |
<details class="faq-item" style="margin-bottom: 16px; background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--border-subtle);">
|
| 355 |
-
<summary style="cursor: pointer; font-weight: 600; color:
|
| 356 |
-
<span style="color:
|
| 357 |
</summary>
|
| 358 |
-
<div style="margin-top: 12px; padding-left: 28px; color:
|
| 359 |
-
<strong style="color:
|
| 360 |
</div>
|
| 361 |
</details>
|
| 362 |
|
| 363 |
<!-- FAQ Item 4 -->
|
| 364 |
<details class="faq-item" style="margin-bottom: 16px; background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--border-subtle);">
|
| 365 |
-
<summary style="cursor: pointer; font-weight: 600; color:
|
| 366 |
-
<span style="color:
|
| 367 |
</summary>
|
| 368 |
-
<div style="margin-top: 12px; padding-left: 28px; color:
|
| 369 |
-
<strong style="color:
|
| 370 |
</div>
|
| 371 |
</details>
|
| 372 |
|
| 373 |
<!-- About Metrics -->
|
| 374 |
<div style="margin-top: 32px; padding: 20px; background: #ffd21e0d; border-radius: 12px; border: 1px solid var(--border-default);">
|
| 375 |
-
<h4 style="color:
|
| 376 |
<span style="font-size: 1.3rem;">📊</span>
|
| 377 |
Understanding the Metrics
|
| 378 |
</h4>
|
| 379 |
|
| 380 |
<div style="display: grid; gap: 16px;">
|
| 381 |
<div>
|
| 382 |
-
<h5 style="color:
|
| 383 |
-
<p style="color:
|
| 384 |
A score from 0 to 1 measuring how successfully the agent completes the user's requested tasks. This evaluates whether the agent achieves the intended goals, follows instructions accurately, and provides complete solutions. Higher scores indicate better task completion.
|
| 385 |
</p>
|
| 386 |
</div>
|
| 387 |
|
| 388 |
<div>
|
| 389 |
-
<h5 style="color:
|
| 390 |
-
<p style="color:
|
| 391 |
A score from 0 to 1 evaluating how well the agent selects and uses the appropriate tools for each task. This includes choosing the right tool, using correct parameters, and proper sequencing of tool calls. Higher scores indicate better tool utilization.
|
| 392 |
</p>
|
| 393 |
</div>
|
| 394 |
|
| 395 |
<div>
|
| 396 |
-
<h5 style="color:
|
| 397 |
-
<p style="color:
|
| 398 |
Models are tested across five business domains: Banking, Healthcare, Insurance, Investment, and Telecom. Each domain has specific scenarios and requirements that test the agent's ability to handle industry-specific tasks and terminology.
|
| 399 |
</p>
|
| 400 |
</div>
|
| 401 |
|
| 402 |
<div>
|
| 403 |
-
<h5 style="color:
|
| 404 |
-
<p style="color:
|
| 405 |
• <strong>Cost:</strong> Total API cost per session in USD<br>
|
| 406 |
• <strong>Duration:</strong> Time to complete tasks in seconds<br>
|
| 407 |
• <strong>Turns:</strong> Number of exchanges to reach resolution<br>
|
|
@@ -411,13 +411,13 @@ def get_faq_section():
|
|
| 411 |
</div>
|
| 412 |
|
| 413 |
<div style="margin-top: 20px; padding-top: 16px; border-top: 1px solid var(--border-subtle);">
|
| 414 |
-
<p style="color:
|
| 415 |
<strong>Learn More:</strong> For detailed methodology and evaluation criteria, visit the
|
| 416 |
-
<a href="https://galileo.ai/blog/agent-leaderboard-v2" target="_blank" style="color:
|
| 417 |
official blog post ↗
|
| 418 |
</a>
|
| 419 |
or explore the
|
| 420 |
-
<a href="https://github.com/rungalileo/agent-leaderboard" target="_blank" style="color:
|
| 421 |
GitHub repository ↗
|
| 422 |
</a>
|
| 423 |
</p>
|
|
@@ -431,7 +431,7 @@ def get_faq_section():
|
|
| 431 |
}
|
| 432 |
|
| 433 |
.faq-item:hover {
|
| 434 |
-
border-color:
|
| 435 |
box-shadow: 0 4px 12px rgba(255, 210, 30, 0.1);
|
| 436 |
}
|
| 437 |
|
|
@@ -444,7 +444,7 @@ def get_faq_section():
|
|
| 444 |
display: inline-block;
|
| 445 |
margin-right: 8px;
|
| 446 |
transition: transform 0.3s ease;
|
| 447 |
-
color:
|
| 448 |
}
|
| 449 |
|
| 450 |
.faq-item[open] summary::before {
|
|
@@ -452,7 +452,7 @@ def get_faq_section():
|
|
| 452 |
}
|
| 453 |
|
| 454 |
.faq-item summary:hover {
|
| 455 |
-
color:
|
| 456 |
}
|
| 457 |
</style>
|
| 458 |
</div>
|
|
|
|
| 8 |
"Private": "#1098F7", # Airglow Blue for Proprietary
|
| 9 |
"Open source": "#58BC82", # Green for Open source
|
| 10 |
"performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
|
| 11 |
+
"text": "white",
|
| 12 |
"background": "#01091A",
|
| 13 |
"grid": (0, 0, 0, 0.1), # RGBA tuple for grid
|
| 14 |
}
|
|
|
|
| 47 |
align-items: center;
|
| 48 |
justify-content: center;
|
| 49 |
min-width: 28px;
|
| 50 |
+
color: white;
|
| 51 |
font-weight: 500;
|
| 52 |
">
|
| 53 |
{rank}
|
|
|
|
| 134 |
<span style="
|
| 135 |
font-family: 'SF Mono', monospace;
|
| 136 |
font-weight: 600;
|
| 137 |
+
color: white;
|
| 138 |
min-width: 60px;
|
| 139 |
">{score:.3f}</span>
|
| 140 |
</div>
|
|
|
|
| 323 |
return """
|
| 324 |
<div class="dark-container" style="margin-top: 40px; margin-bottom: 40px;">
|
| 325 |
<div class="section-header">
|
| 326 |
+
<span class="section-icon" style="color: white;">❓</span>
|
| 327 |
+
<h3 style="margin: 0; color: white; font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
|
| 328 |
Frequently Asked Questions
|
| 329 |
</h3>
|
| 330 |
</div>
|
|
|
|
| 332 |
<div style="margin-top: 24px;">
|
| 333 |
<!-- FAQ Item 1 -->
|
| 334 |
<details class="faq-item" style="margin-bottom: 16px; background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--border-subtle);">
|
| 335 |
+
<summary style="cursor: pointer; font-weight: 600; color: white; font-size: 1rem; display: flex; align-items: center; gap: 8px;">
|
| 336 |
+
<span style="color: white;"></span> Does the methodology favor GPT-4.1 since it uses GPT-4.1 to simulate users and tools, so GPT-4.1 ranks itself highest.
|
| 337 |
</summary>
|
| 338 |
+
<div style="margin-top: 12px; padding-left: 28px; color: white; line-height: 1.6;">
|
| 339 |
+
<strong style="color: white;"></strong> GPT's top ranking isn't due to simulator bias. Scenarios are pre-generated with Claude and fixed for all models. The user simulator drives goal-based conversations, and the tool simulator provides synthetic responses without influencing outcomes. Evaluation uses Claude as a judge, which should theoretically favor Claude (per sycophancy theory), but GPTs still lead.
|
| 340 |
</div>
|
| 341 |
</details>
|
| 342 |
|
| 343 |
<!-- FAQ Item 2 -->
|
| 344 |
<details class="faq-item" style="margin-bottom: 16px; background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--border-subtle);">
|
| 345 |
+
<summary style="cursor: pointer; font-weight: 600; color: white; font-size: 1rem; display: flex; align-items: center; gap: 8px;">
|
| 346 |
+
<span style="color: white;"></span> Why does a specific model rank lower when our internal results show otherwise?
|
| 347 |
</summary>
|
| 348 |
+
<div style="margin-top: 12px; padding-left: 28px; color: white; line-height: 1.6;">
|
| 349 |
+
<strong style="color: white;"></strong> Performance varies by prompt, task, complexity, and domain. Our evaluations kept prompts identical across models for consistency. Different evaluation methodologies and task sets can lead to different rankings.
|
| 350 |
</div>
|
| 351 |
</details>
|
| 352 |
|
| 353 |
<!-- FAQ Item 3 -->
|
| 354 |
<details class="faq-item" style="margin-bottom: 16px; background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--border-subtle);">
|
| 355 |
+
<summary style="cursor: pointer; font-weight: 600; color: white; font-size: 1rem; display: flex; align-items: center; gap: 8px;">
|
| 356 |
+
<span style="color: white;"></span> Why is my favorite model missing?
|
| 357 |
</summary>
|
| 358 |
+
<div style="margin-top: 12px; padding-left: 28px; color: white; line-height: 1.6;">
|
| 359 |
+
<strong style="color: white;"></strong> We were not able to add certain models either because they were not in our initial list or had issues while running the experiments, such as improper tool call output format. We skipped some of the models which performed poorly in our leaderboard v1.
|
| 360 |
</div>
|
| 361 |
</details>
|
| 362 |
|
| 363 |
<!-- FAQ Item 4 -->
|
| 364 |
<details class="faq-item" style="margin-bottom: 16px; background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--border-subtle);">
|
| 365 |
+
<summary style="cursor: pointer; font-weight: 600; color: white; font-size: 1rem; display: flex; align-items: center; gap: 8px;">
|
| 366 |
+
<span style="color: white;"></span> We were surprised Gemini 2.5 Pro ranked lower. Our internal benchmarks show it's excellent for code research and AI code review tasks.
|
| 367 |
</summary>
|
| 368 |
+
<div style="margin-top: 12px; padding-left: 28px; color: white; line-height: 1.6;">
|
| 369 |
+
<strong style="color: white;"></strong> Results differ because this leaderboard evaluates support agent scenarios only, not coding ones. Different models excel at different types of tasks, and this benchmark focuses specifically on business support agent use cases across banking, healthcare, insurance, investment, and telecom domains.
|
| 370 |
</div>
|
| 371 |
</details>
|
| 372 |
|
| 373 |
<!-- About Metrics -->
|
| 374 |
<div style="margin-top: 32px; padding: 20px; background: #ffd21e0d; border-radius: 12px; border: 1px solid var(--border-default);">
|
| 375 |
+
<h4 style="color: white; margin-top: 0; margin-bottom: 16px; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;">
|
| 376 |
<span style="font-size: 1.3rem;">📊</span>
|
| 377 |
Understanding the Metrics
|
| 378 |
</h4>
|
| 379 |
|
| 380 |
<div style="display: grid; gap: 16px;">
|
| 381 |
<div>
|
| 382 |
+
<h5 style="color: white; margin: 0 0 8px 0; font-size: 1rem;">Action Completion (AC)</h5>
|
| 383 |
+
<p style="color: white; margin: 0; line-height: 1.5;">
|
| 384 |
A score from 0 to 1 measuring how successfully the agent completes the user's requested tasks. This evaluates whether the agent achieves the intended goals, follows instructions accurately, and provides complete solutions. Higher scores indicate better task completion.
|
| 385 |
</p>
|
| 386 |
</div>
|
| 387 |
|
| 388 |
<div>
|
| 389 |
+
<h5 style="color: white; margin: 0 0 8px 0; font-size: 1rem;">Tool Selection Quality (TSQ)</h5>
|
| 390 |
+
<p style="color: white; margin: 0; line-height: 1.5;">
|
| 391 |
A score from 0 to 1 evaluating how well the agent selects and uses the appropriate tools for each task. This includes choosing the right tool, using correct parameters, and proper sequencing of tool calls. Higher scores indicate better tool utilization.
|
| 392 |
</p>
|
| 393 |
</div>
|
| 394 |
|
| 395 |
<div>
|
| 396 |
+
<h5 style="color: white; margin: 0 0 8px 0; font-size: 1rem;">Domain-Specific Performance</h5>
|
| 397 |
+
<p style="color: white; margin: 0; line-height: 1.5;">
|
| 398 |
Models are tested across five business domains: Banking, Healthcare, Insurance, Investment, and Telecom. Each domain has specific scenarios and requirements that test the agent's ability to handle industry-specific tasks and terminology.
|
| 399 |
</p>
|
| 400 |
</div>
|
| 401 |
|
| 402 |
<div>
|
| 403 |
+
<h5 style="color: white; margin: 0 0 8px 0; font-size: 1rem;">Efficiency Metrics</h5>
|
| 404 |
+
<p style="color: white; margin: 0; line-height: 1.5;">
|
| 405 |
• <strong>Cost:</strong> Total API cost per session in USD<br>
|
| 406 |
• <strong>Duration:</strong> Time to complete tasks in seconds<br>
|
| 407 |
• <strong>Turns:</strong> Number of exchanges to reach resolution<br>
|
|
|
|
| 411 |
</div>
|
| 412 |
|
| 413 |
<div style="margin-top: 20px; padding-top: 16px; border-top: 1px solid var(--border-subtle);">
|
| 414 |
+
<p style="color: white; margin: 0; font-size: 0.9rem; line-height: 1.5;">
|
| 415 |
<strong>Learn More:</strong> For detailed methodology and evaluation criteria, visit the
|
| 416 |
+
<a href="https://galileo.ai/blog/agent-leaderboard-v2" target="_blank" style="color: white; text-decoration: none;">
|
| 417 |
official blog post ↗
|
| 418 |
</a>
|
| 419 |
or explore the
|
| 420 |
+
<a href="https://github.com/rungalileo/agent-leaderboard" target="_blank" style="color: white; text-decoration: none;">
|
| 421 |
GitHub repository ↗
|
| 422 |
</a>
|
| 423 |
</p>
|
|
|
|
| 431 |
}
|
| 432 |
|
| 433 |
.faq-item:hover {
|
| 434 |
+
border-color: white !important;
|
| 435 |
box-shadow: 0 4px 12px rgba(255, 210, 30, 0.1);
|
| 436 |
}
|
| 437 |
|
|
|
|
| 444 |
display: inline-block;
|
| 445 |
margin-right: 8px;
|
| 446 |
transition: transform 0.3s ease;
|
| 447 |
+
color: white;
|
| 448 |
}
|
| 449 |
|
| 450 |
.faq-item[open] summary::before {
|
|
|
|
| 452 |
}
|
| 453 |
|
| 454 |
.faq-item summary:hover {
|
| 455 |
+
color: white !important;
|
| 456 |
}
|
| 457 |
</style>
|
| 458 |
</div>
|
styles/leaderboard_styles.py
CHANGED
|
@@ -58,18 +58,18 @@ def get_leaderboard_css():
|
|
| 58 |
}
|
| 59 |
|
| 60 |
p, span, div {
|
| 61 |
-
color:
|
| 62 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 63 |
}
|
| 64 |
|
| 65 |
/* Labels and info text */
|
| 66 |
label {
|
| 67 |
-
color:
|
| 68 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 69 |
}
|
| 70 |
|
| 71 |
.gr-box label {
|
| 72 |
-
color:
|
| 73 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 74 |
}
|
| 75 |
|
|
@@ -154,6 +154,11 @@ def get_leaderboard_css():
|
|
| 154 |
border-color: var(--border-default) !important;
|
| 155 |
}
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
input[type="radio"]:checked {
|
| 158 |
background-color: var(--accent-primary) !important;
|
| 159 |
border-color: var(--accent-primary) !important;
|
|
@@ -164,10 +169,29 @@ def get_leaderboard_css():
|
|
| 164 |
.dropdown {
|
| 165 |
border-color: var(--border-default) !important;
|
| 166 |
background: var(--bg-card) !important;
|
| 167 |
-
color:
|
| 168 |
transition: all 0.2s ease !important;
|
| 169 |
}
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
.dropdown:hover {
|
| 172 |
border-color: var(--accent-primary) !important;
|
| 173 |
box-shadow: 0 0 15px var(--glow-primary) !important;
|
|
@@ -184,12 +208,19 @@ def get_leaderboard_css():
|
|
| 184 |
overflow-y: auto !important;
|
| 185 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 186 |
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
}
|
| 188 |
|
| 189 |
/* Button styling */
|
| 190 |
button {
|
| 191 |
background: var(--bg-card) !important;
|
| 192 |
-
color:
|
| 193 |
border: 1px solid var(--border-default) !important;
|
| 194 |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
|
| 195 |
}
|
|
|
|
| 58 |
}
|
| 59 |
|
| 60 |
p, span, div {
|
| 61 |
+
color: white !important;
|
| 62 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 63 |
}
|
| 64 |
|
| 65 |
/* Labels and info text */
|
| 66 |
label {
|
| 67 |
+
color: white !important;
|
| 68 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 69 |
}
|
| 70 |
|
| 71 |
.gr-box label {
|
| 72 |
+
color: white !important;
|
| 73 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 74 |
}
|
| 75 |
|
|
|
|
| 154 |
border-color: var(--border-default) !important;
|
| 155 |
}
|
| 156 |
|
| 157 |
+
/* Radio button labels */
|
| 158 |
+
input[type="radio"] + label {
|
| 159 |
+
color: white !important;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
input[type="radio"]:checked {
|
| 163 |
background-color: var(--accent-primary) !important;
|
| 164 |
border-color: var(--accent-primary) !important;
|
|
|
|
| 169 |
.dropdown {
|
| 170 |
border-color: var(--border-default) !important;
|
| 171 |
background: var(--bg-card) !important;
|
| 172 |
+
color: white !important;
|
| 173 |
transition: all 0.2s ease !important;
|
| 174 |
}
|
| 175 |
|
| 176 |
+
/* Dropdown option styling */
|
| 177 |
+
.dropdown option {
|
| 178 |
+
background: var(--bg-card) !important;
|
| 179 |
+
color: white !important;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
/* Gradio dropdown specific styling */
|
| 183 |
+
.gradio-dropdown select,
|
| 184 |
+
.gradio-dropdown [role="combobox"],
|
| 185 |
+
.gradio-dropdown input {
|
| 186 |
+
color: white !important;
|
| 187 |
+
background: var(--bg-card) !important;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.gradio-dropdown option {
|
| 191 |
+
color: white !important;
|
| 192 |
+
background: var(--bg-card) !important;
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
.dropdown:hover {
|
| 196 |
border-color: var(--accent-primary) !important;
|
| 197 |
box-shadow: 0 0 15px var(--glow-primary) !important;
|
|
|
|
| 208 |
overflow-y: auto !important;
|
| 209 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 210 |
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important;
|
| 211 |
+
color: white !important;
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
/* Table cells and headers */
|
| 215 |
+
.dataframe td,
|
| 216 |
+
.dataframe th {
|
| 217 |
+
color: white !important;
|
| 218 |
}
|
| 219 |
|
| 220 |
/* Button styling */
|
| 221 |
button {
|
| 222 |
background: var(--bg-card) !important;
|
| 223 |
+
color: white !important;
|
| 224 |
border: 1px solid var(--border-default) !important;
|
| 225 |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
|
| 226 |
}
|
tabs/leaderboard_v1.py
CHANGED
|
@@ -201,7 +201,7 @@ def create_leaderboard_v2_tab():
|
|
| 201 |
for idx in range(n)
|
| 202 |
)
|
| 203 |
label_spans = "\n".join(
|
| 204 |
-
f'<text x="{point(1.1, idx)[0]:.2f}" y="{point(1.1, idx)[1]:.2f}" text-anchor="middle" dominant-baseline="middle" font-size="9" fill="
|
| 205 |
for idx, label in enumerate(labels)
|
| 206 |
)
|
| 207 |
svg = f"""
|
|
@@ -228,32 +228,32 @@ def create_leaderboard_v2_tab():
|
|
| 228 |
"description": "7개의 태스크 전반의 평균 성능을 한눈에 살펴보고 각 레벨 비교를 위한 기준점을 제공합니다."
|
| 229 |
},
|
| 230 |
"L1": {
|
| 231 |
-
"title": "L1 · 단일 도구
|
| 232 |
-
"description": "
|
| 233 |
},
|
| 234 |
"L2": {
|
| 235 |
-
"title": "L2 · 도구 선택
|
| 236 |
-
"description": "
|
| 237 |
},
|
| 238 |
"L3": {
|
| 239 |
-
"title": "L3 · 순차적 추론 (Chaining)",
|
| 240 |
-
"description": "
|
| 241 |
},
|
| 242 |
"L4": {
|
| 243 |
-
"title": "L4 · 병렬적 추론 (Aggregation)",
|
| 244 |
-
"description": "
|
| 245 |
},
|
| 246 |
"L5": {
|
| 247 |
-
"title": "L5 · 강건성 (Robustness / Fallback)",
|
| 248 |
-
"description": "
|
| 249 |
},
|
| 250 |
"L6": {
|
| 251 |
-
"title": "L6 · 효율성 (Efficiency)",
|
| 252 |
-
"description": "
|
| 253 |
},
|
| 254 |
"L7": {
|
| 255 |
-
"title": "L7 · 장기 컨텍스트 기억 (Contextual Memory)",
|
| 256 |
-
"description": "
|
| 257 |
}
|
| 258 |
}
|
| 259 |
default_level = "ALL"
|
|
@@ -291,7 +291,7 @@ def create_leaderboard_v2_tab():
|
|
| 291 |
border-collapse: collapse;
|
| 292 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
|
| 293 |
background: var(--bg-card);
|
| 294 |
-
color:
|
| 295 |
}
|
| 296 |
|
| 297 |
.v2-styled-table thead {
|
|
@@ -305,7 +305,7 @@ def create_leaderboard_v2_tab():
|
|
| 305 |
padding: 14px 12px;
|
| 306 |
text-align: left;
|
| 307 |
font-weight: 600;
|
| 308 |
-
color:
|
| 309 |
border-bottom: 2px solid var(--accent-primary);
|
| 310 |
font-size: 13px;
|
| 311 |
text-transform: uppercase;
|
|
@@ -319,7 +319,7 @@ def create_leaderboard_v2_tab():
|
|
| 319 |
.v2-styled-table td {
|
| 320 |
padding: 12px;
|
| 321 |
border-bottom: 1px solid var(--border-subtle);
|
| 322 |
-
color:
|
| 323 |
transition: all 0.2s ease;
|
| 324 |
}
|
| 325 |
|
|
@@ -339,13 +339,13 @@ def create_leaderboard_v2_tab():
|
|
| 339 |
|
| 340 |
.model-name {
|
| 341 |
font-weight: 500;
|
| 342 |
-
color:
|
| 343 |
transition: color 0.2s ease;
|
| 344 |
}
|
| 345 |
|
| 346 |
/* Keep model name color consistent on hover to emphasize row highlight */
|
| 347 |
.v2-styled-table tr:hover .model-name {
|
| 348 |
-
color:
|
| 349 |
}
|
| 350 |
|
| 351 |
.numeric-cell {
|
|
@@ -356,12 +356,12 @@ def create_leaderboard_v2_tab():
|
|
| 356 |
|
| 357 |
.highlight-header {
|
| 358 |
background: rgba(255, 210, 30, 0.14);
|
| 359 |
-
color:
|
| 360 |
}
|
| 361 |
|
| 362 |
.highlight-cell {
|
| 363 |
background: rgba(255, 210, 30, 0.08);
|
| 364 |
-
color:
|
| 365 |
font-weight: 600;
|
| 366 |
}
|
| 367 |
</style>
|
|
@@ -459,8 +459,8 @@ def create_leaderboard_v2_tab():
|
|
| 459 |
return f"""
|
| 460 |
<div class="domain-selector-container leaderboard-intro">
|
| 461 |
<div class="domain-header">
|
| 462 |
-
<h2 class="domain-title">Agent Leaderboard · {level_title}</h2>
|
| 463 |
-
<p class="domain-subtitle">{level_description}</p>
|
| 464 |
</div>
|
| 465 |
<div class="dataframe-container">
|
| 466 |
"""
|
|
@@ -1097,14 +1097,14 @@ def create_leaderboard_v2_tab():
|
|
| 1097 |
# Links section below title
|
| 1098 |
gr.HTML("""
|
| 1099 |
<div class="hero-actions">
|
| 1100 |
-
<a href="
|
| 1101 |
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 1102 |
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
|
| 1103 |
<line x1="8" y1="12" x2="16" y2="12"/>
|
| 1104 |
</svg>
|
| 1105 |
<span>Blog</span>
|
| 1106 |
</a>
|
| 1107 |
-
<a href="https://github.com/
|
| 1108 |
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 1109 |
<path d="M9 19c-5 1.5-5-2.5-7-3"/>
|
| 1110 |
<path d="M20 21v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
|
|
@@ -1145,11 +1145,11 @@ def create_leaderboard_v2_tab():
|
|
| 1145 |
<span>80%</span>
|
| 1146 |
</div>
|
| 1147 |
<ul class="phase-list">
|
| 1148 |
-
<li>L1: 단일 도구 실행</li>
|
| 1149 |
-
<li>L2: 도구 선택 능력</li>
|
| 1150 |
-
<li>L3: 순차적 reasoning (Chaining)</li>
|
| 1151 |
-
<li>L4: 병렬적 reasoning (Aggregation)</li>
|
| 1152 |
-
<li>L5: 강건성 (Robustness / Fallback)</li>
|
| 1153 |
</ul>
|
| 1154 |
</div>
|
| 1155 |
<div class="phase-card">
|
|
@@ -1158,8 +1158,8 @@ def create_leaderboard_v2_tab():
|
|
| 1158 |
<span>20%</span>
|
| 1159 |
</div>
|
| 1160 |
<ul class="phase-list">
|
| 1161 |
-
<li>L6: 효율성 (Efficiency)</li>
|
| 1162 |
-
<li>L7: 장기 컨텍스트 기억 (Contextual Memory)</li>
|
| 1163 |
</ul>
|
| 1164 |
</div>
|
| 1165 |
</div>
|
|
@@ -1943,8 +1943,8 @@ def create_leaderboard_v2_tab():
|
|
| 1943 |
with gr.Column(elem_classes=["domain-selector-container"], elem_id="task-level-selector"):
|
| 1944 |
gr.HTML("""
|
| 1945 |
<div class="domain-header">
|
| 1946 |
-
<h2 class="domain-title">🧠 Select Task Level</h2>
|
| 1947 |
-
<p class="domain-subtitle">Ko-AgentBench의 ALL · L1~L7 단계별 에이전트 성능을 손쉽게 비교하세요.</p>
|
| 1948 |
</div>
|
| 1949 |
""")
|
| 1950 |
domain_filter = gr.Radio(
|
|
@@ -1960,14 +1960,14 @@ def create_leaderboard_v2_tab():
|
|
| 1960 |
with gr.Column(elem_classes=["domain-selector-container", "filters-sorting-container"], elem_id="filters-sorting-container"):
|
| 1961 |
gr.HTML("""
|
| 1962 |
<div class="domain-header">
|
| 1963 |
-
<h2 class="domain-title">🔍 Filters & Sorting</h2>
|
| 1964 |
-
<p class="domain-subtitle">모델 접근 방식과 정렬 순서를 선택해 맞춤 뷰를 구성하세요.</p>
|
| 1965 |
</div>
|
| 1966 |
""")
|
| 1967 |
with gr.Row(elem_classes=["filters-sorting-row"]):
|
| 1968 |
with gr.Column(scale=1, elem_classes=["filter-group"]):
|
| 1969 |
with gr.Row(elem_classes=["filter-group-row"]):
|
| 1970 |
-
gr.HTML("<span class='filter-group-label'>Model Access</span>")
|
| 1971 |
model_type_filter = gr.Radio(
|
| 1972 |
choices=["All", "OSS", "API"],
|
| 1973 |
value="All",
|
|
@@ -1977,7 +1977,7 @@ def create_leaderboard_v2_tab():
|
|
| 1977 |
)
|
| 1978 |
with gr.Column(scale=1, elem_classes=["filter-group"]):
|
| 1979 |
with gr.Row(elem_classes=["filter-group-row"]):
|
| 1980 |
-
gr.HTML("<span class='filter-group-label'>Sort Order</span>")
|
| 1981 |
sort_order = gr.Radio(
|
| 1982 |
choices=["Descending", "Ascending"],
|
| 1983 |
value="Descending",
|
|
@@ -1999,16 +1999,16 @@ def create_leaderboard_v2_tab():
|
|
| 1999 |
gr.HTML("""
|
| 2000 |
<div class="domain-selector-container domain-performance-container">
|
| 2001 |
<div class="domain-header">
|
| 2002 |
-
<h2 class="domain-title">Core Capability Radar</h2>
|
| 2003 |
-
<p class="domain-subtitle">Track six essential pillars: Success, Execution, Reasoning, Robustness, Efficiency, and Call Validity.</p>
|
| 2004 |
</div>
|
| 2005 |
""")
|
| 2006 |
|
| 2007 |
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="radar-model-selector"):
|
| 2008 |
gr.HTML("""
|
| 2009 |
<div class="domain-header">
|
| 2010 |
-
<h2 class="domain-title">🎯 Select Models for Comparison</h2>
|
| 2011 |
-
<p class="domain-subtitle">Choose up to 5 models to map on the capability radar.</p>
|
| 2012 |
</div>
|
| 2013 |
""")
|
| 2014 |
model_selector = gr.Dropdown(
|
|
@@ -2039,16 +2039,16 @@ def create_leaderboard_v2_tab():
|
|
| 2039 |
gr.HTML("""
|
| 2040 |
<div class="domain-selector-container domain-performance-container level-metrics-wrapper">
|
| 2041 |
<div class="domain-header">
|
| 2042 |
-
<h2 class="domain-title">Level-Specific Metric Spotlight</h2>
|
| 2043 |
-
<p class="domain-subtitle">Dive deeper into each Ko-AgentBench stage and compare model scores across its unique evaluation metrics.</p>
|
| 2044 |
</div>
|
| 2045 |
""")
|
| 2046 |
|
| 2047 |
with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
|
| 2048 |
gr.HTML("""
|
| 2049 |
<div class="domain-header">
|
| 2050 |
-
<h2 class="domain-title">🧭 Select Task Level and Models</h2>
|
| 2051 |
-
<p class="domain-subtitle">Choose a level and up to 5 models to explore their detailed SR-driven metrics.</p>
|
| 2052 |
</div>
|
| 2053 |
""")
|
| 2054 |
level_metric_selector = gr.Dropdown(
|
|
@@ -2085,8 +2085,8 @@ def create_leaderboard_v2_tab():
|
|
| 2085 |
gr.HTML("""
|
| 2086 |
<div class="domain-selector-container domain-performance-container heatmap-wrapper">
|
| 2087 |
<div class="domain-header">
|
| 2088 |
-
<h2 class="domain-title">Comprehensive Performance Heatmap</h2>
|
| 2089 |
-
<p class="domain-subtitle">View Ko-AgentBench SR scores across L1~L7 for each model in a single glance.</p>
|
| 2090 |
</div>
|
| 2091 |
<div class="chart-container heatmap-chart-container">
|
| 2092 |
""")
|
|
@@ -2451,8 +2451,8 @@ def create_leaderboard_v2_tab():
|
|
| 2451 |
gr.HTML("""
|
| 2452 |
<div class="domain-selector-container performance-card-container">
|
| 2453 |
<div class="domain-header">
|
| 2454 |
-
<h2 class="domain-title">Model Performance Card</h2>
|
| 2455 |
-
<p class="domain-subtitle">Comprehensive performance card for any model - perfect for presentations and reports</p>
|
| 2456 |
</div>
|
| 2457 |
<div class="performance-card-content">
|
| 2458 |
""")
|
|
@@ -2460,8 +2460,8 @@ def create_leaderboard_v2_tab():
|
|
| 2460 |
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
|
| 2461 |
gr.HTML("""
|
| 2462 |
<div class="domain-header">
|
| 2463 |
-
<h2 class="domain-title">🤖 Select Model</h2>
|
| 2464 |
-
<p class="domain-subtitle">비교할 모델을 선택하세요.</p>
|
| 2465 |
</div>
|
| 2466 |
""")
|
| 2467 |
card_model_selector = gr.Dropdown(
|
|
@@ -3109,7 +3109,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
|
|
| 3109 |
hoverlabel=dict(
|
| 3110 |
bgcolor="rgba(1, 9, 26, 0.95)",
|
| 3111 |
bordercolor=colors['line'],
|
| 3112 |
-
font=dict(color="
|
| 3113 |
)
|
| 3114 |
)
|
| 3115 |
)
|
|
@@ -3132,7 +3132,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
|
|
| 3132 |
ticktext=tick_text,
|
| 3133 |
tickfont=dict(
|
| 3134 |
size=11,
|
| 3135 |
-
color='
|
| 3136 |
family="'Geist Mono', monospace"
|
| 3137 |
)
|
| 3138 |
),
|
|
@@ -3144,7 +3144,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
|
|
| 3144 |
tickfont=dict(
|
| 3145 |
size=13,
|
| 3146 |
family="'Geist', sans-serif",
|
| 3147 |
-
color='
|
| 3148 |
weight=600
|
| 3149 |
),
|
| 3150 |
rotation=90,
|
|
@@ -3158,7 +3158,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
|
|
| 3158 |
y=-0.15,
|
| 3159 |
xanchor="center",
|
| 3160 |
x=0.5,
|
| 3161 |
-
font=dict(size=12, family="'Geist', sans-serif", color='
|
| 3162 |
bgcolor='rgba(1, 9, 26, 0.8)',
|
| 3163 |
bordercolor='rgba(245, 246, 247, 0.2)',
|
| 3164 |
borderwidth=1,
|
|
@@ -3172,7 +3172,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
|
|
| 3172 |
font=dict(
|
| 3173 |
size=22,
|
| 3174 |
family="'Geist', sans-serif",
|
| 3175 |
-
color="
|
| 3176 |
weight=700
|
| 3177 |
),
|
| 3178 |
),
|
|
@@ -3271,8 +3271,8 @@ def create_performance_heatmap(df, ordered_models=None, max_models=12):
|
|
| 3271 |
hovertemplate="<b>%{y}</b><br><span style='color:#FFD21E'>%{x}</span><br>SR · %{z:.3f}<extra></extra>",
|
| 3272 |
colorbar=dict(
|
| 3273 |
title="Success Rate",
|
| 3274 |
-
titlefont=dict(color="
|
| 3275 |
-
tickfont=dict(color="
|
| 3276 |
thickness=12,
|
| 3277 |
len=0.7,
|
| 3278 |
outlinecolor="rgba(255, 255, 255, 0.1)",
|
|
@@ -3309,17 +3309,17 @@ def create_performance_heatmap(df, ordered_models=None, max_models=12):
|
|
| 3309 |
margin=dict(t=80, b=90, l=110, r=160),
|
| 3310 |
height=520,
|
| 3311 |
width=1450,
|
| 3312 |
-
font=dict(family="'Geist', sans-serif", color="
|
| 3313 |
xaxis=dict(
|
| 3314 |
tickangle=-25,
|
| 3315 |
showgrid=False,
|
| 3316 |
ticks="",
|
| 3317 |
-
tickfont=dict(size=11, family="'Geist', sans-serif", color="
|
| 3318 |
),
|
| 3319 |
yaxis=dict(
|
| 3320 |
showgrid=False,
|
| 3321 |
ticks="",
|
| 3322 |
-
tickfont=dict(size=12, family="'Geist', sans-serif", color="
|
| 3323 |
),
|
| 3324 |
annotations=annotations,
|
| 3325 |
title=dict(
|
|
@@ -3329,7 +3329,7 @@ def create_performance_heatmap(df, ordered_models=None, max_models=12):
|
|
| 3329 |
font=dict(
|
| 3330 |
size=20,
|
| 3331 |
family="'Geist', sans-serif",
|
| 3332 |
-
color="
|
| 3333 |
weight=700
|
| 3334 |
),
|
| 3335 |
)
|
|
@@ -3349,7 +3349,7 @@ def create_empty_heatmap(message):
|
|
| 3349 |
xanchor='center', yanchor='middle',
|
| 3350 |
font=dict(
|
| 3351 |
size=18,
|
| 3352 |
-
color="
|
| 3353 |
family="'Geist', sans-serif"
|
| 3354 |
),
|
| 3355 |
showarrow=False,
|
|
@@ -3372,7 +3372,7 @@ def create_empty_heatmap(message):
|
|
| 3372 |
font=dict(
|
| 3373 |
size=20,
|
| 3374 |
family="'Geist', sans-serif",
|
| 3375 |
-
color="
|
| 3376 |
weight=700
|
| 3377 |
),
|
| 3378 |
)
|
|
@@ -3500,24 +3500,24 @@ def create_level_metric_chart(df, level, selected_models=None, max_models=5):
|
|
| 3500 |
bgcolor='rgba(1, 9, 26, 0.75)',
|
| 3501 |
bordercolor='rgba(245, 246, 247, 0.2)',
|
| 3502 |
borderwidth=1,
|
| 3503 |
-
font=dict(size=11, family="'Geist', sans-serif", color='
|
| 3504 |
),
|
| 3505 |
xaxis=dict(
|
| 3506 |
-
title=dict(text=f"<b>{level} Metric Score</b>", font=dict(size=14, color="
|
| 3507 |
-
tickfont=dict(size=11, color="
|
| 3508 |
gridcolor='rgba(245, 246, 247, 0.08)',
|
| 3509 |
zerolinecolor='rgba(245, 246, 247, 0.18)',
|
| 3510 |
range=x_range
|
| 3511 |
),
|
| 3512 |
yaxis=dict(
|
| 3513 |
-
tickfont=dict(size=13, color="
|
| 3514 |
automargin=True
|
| 3515 |
),
|
| 3516 |
title=dict(
|
| 3517 |
text=f"<b>{level} Metric Breakdown</b>",
|
| 3518 |
x=0.5,
|
| 3519 |
y=0.98,
|
| 3520 |
-
font=dict(size=20, family="'Geist', sans-serif", color="
|
| 3521 |
)
|
| 3522 |
)
|
| 3523 |
return fig
|
|
@@ -3530,7 +3530,7 @@ def create_empty_level_metric_chart(message):
|
|
| 3530 |
xref="paper", yref="paper",
|
| 3531 |
x=0.5, y=0.5,
|
| 3532 |
xanchor='center', yanchor='middle',
|
| 3533 |
-
font=dict(size=18, color="
|
| 3534 |
showarrow=False,
|
| 3535 |
bgcolor="rgba(245, 246, 247, 0.05)",
|
| 3536 |
bordercolor="rgba(245, 246, 247, 0.2)",
|
|
@@ -3547,7 +3547,7 @@ def create_empty_level_metric_chart(message):
|
|
| 3547 |
text="<b>Level Metric Breakdown</b>",
|
| 3548 |
x=0.5,
|
| 3549 |
y=0.98,
|
| 3550 |
-
font=dict(size=20, family="'Geist', sans-serif", color="
|
| 3551 |
)
|
| 3552 |
)
|
| 3553 |
fig.update_xaxes(visible=False)
|
|
@@ -3566,7 +3566,7 @@ def create_empty_radar_chart(message):
|
|
| 3566 |
xanchor='center', yanchor='middle',
|
| 3567 |
font=dict(
|
| 3568 |
size=18,
|
| 3569 |
-
color="
|
| 3570 |
family="'Geist', sans-serif"
|
| 3571 |
),
|
| 3572 |
showarrow=False,
|
|
@@ -3589,7 +3589,7 @@ def create_empty_radar_chart(message):
|
|
| 3589 |
font=dict(
|
| 3590 |
size=22,
|
| 3591 |
family="'Geist', sans-serif",
|
| 3592 |
-
color="
|
| 3593 |
weight=700
|
| 3594 |
),
|
| 3595 |
),
|
|
@@ -3647,7 +3647,7 @@ def create_cost_performance_scatter(df, metric="Avg AC"):
|
|
| 3647 |
name=legend_name,
|
| 3648 |
text=df_type['Model'],
|
| 3649 |
textposition="top center",
|
| 3650 |
-
textfont=dict(size=10, color='
|
| 3651 |
marker=dict(
|
| 3652 |
size=df_type['Avg Turns'] * 3, # Size based on number of turns
|
| 3653 |
color=color_map.get(model_type, '#F5F6F7'),
|
|
@@ -3671,7 +3671,7 @@ def create_cost_performance_scatter(df, metric="Avg AC"):
|
|
| 3671 |
# Add quadrant labels
|
| 3672 |
fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance<br>Low Cost",
|
| 3673 |
showarrow=False, xref="paper", yref="paper",
|
| 3674 |
-
font=dict(size=12, color="
|
| 3675 |
fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
|
| 3676 |
showarrow=False, xref="paper", yref="paper",
|
| 3677 |
font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)")
|
|
@@ -3683,23 +3683,23 @@ def create_cost_performance_scatter(df, metric="Avg AC"):
|
|
| 3683 |
text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
|
| 3684 |
x=0.5,
|
| 3685 |
y=0.97,
|
| 3686 |
-
font=dict(size=22, family="'Geist', sans-serif", color="
|
| 3687 |
),
|
| 3688 |
xaxis=dict(
|
| 3689 |
title=dict(
|
| 3690 |
text=f"<b>{metric_display}</b>",
|
| 3691 |
-
font=dict(size=16, color="
|
| 3692 |
),
|
| 3693 |
-
tickfont=dict(size=12, color="
|
| 3694 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 3695 |
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 3696 |
),
|
| 3697 |
yaxis=dict(
|
| 3698 |
title=dict(
|
| 3699 |
text="<b>Average Session Cost ($)</b>",
|
| 3700 |
-
font=dict(size=16, color="
|
| 3701 |
),
|
| 3702 |
-
tickfont=dict(size=12, color="
|
| 3703 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 3704 |
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 3705 |
),
|
|
@@ -3714,7 +3714,7 @@ def create_cost_performance_scatter(df, metric="Avg AC"):
|
|
| 3714 |
y=1.02,
|
| 3715 |
xanchor="right",
|
| 3716 |
x=1,
|
| 3717 |
-
font=dict(size=12, family="'Geist', sans-serif", color='
|
| 3718 |
bgcolor='rgba(1, 9, 26, 0.8)',
|
| 3719 |
bordercolor='rgba(245, 246, 247, 0.2)',
|
| 3720 |
borderwidth=1
|
|
@@ -3749,7 +3749,7 @@ def create_speed_accuracy_plot(df, metric="Avg AC"):
|
|
| 3749 |
mode='markers+text',
|
| 3750 |
text=df_filtered['Model'],
|
| 3751 |
textposition="top center",
|
| 3752 |
-
textfont=dict(size=9, color='
|
| 3753 |
marker=dict(
|
| 3754 |
size=12,
|
| 3755 |
color=df_filtered['Avg Total Cost'],
|
|
@@ -3758,9 +3758,9 @@ def create_speed_accuracy_plot(df, metric="Avg AC"):
|
|
| 3758 |
colorbar=dict(
|
| 3759 |
title=dict(
|
| 3760 |
text="Cost ($)",
|
| 3761 |
-
font=dict(color="
|
| 3762 |
),
|
| 3763 |
-
tickfont=dict(color="
|
| 3764 |
bgcolor="rgba(1, 9, 26, 0.8)",
|
| 3765 |
bordercolor="rgba(245, 246, 247, 0.2)",
|
| 3766 |
borderwidth=1,
|
|
@@ -3785,7 +3785,7 @@ def create_speed_accuracy_plot(df, metric="Avg AC"):
|
|
| 3785 |
# Add quadrant labels
|
| 3786 |
fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate",
|
| 3787 |
showarrow=False, xref="paper", yref="paper",
|
| 3788 |
-
font=dict(size=12, color="
|
| 3789 |
fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
|
| 3790 |
showarrow=False, xref="paper", yref="paper",
|
| 3791 |
font=dict(size=12, color="#ffd21e", weight=600))
|
|
@@ -3797,23 +3797,23 @@ def create_speed_accuracy_plot(df, metric="Avg AC"):
|
|
| 3797 |
text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
|
| 3798 |
x=0.5,
|
| 3799 |
y=0.97,
|
| 3800 |
-
font=dict(size=22, family="'Geist', sans-serif", color="
|
| 3801 |
),
|
| 3802 |
xaxis=dict(
|
| 3803 |
title=dict(
|
| 3804 |
text=f"<b>{metric_display}</b>",
|
| 3805 |
-
font=dict(size=16, color="
|
| 3806 |
),
|
| 3807 |
-
tickfont=dict(size=12, color="
|
| 3808 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 3809 |
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 3810 |
),
|
| 3811 |
yaxis=dict(
|
| 3812 |
title=dict(
|
| 3813 |
text="<b>Average Session Duration (seconds)</b>",
|
| 3814 |
-
font=dict(size=16, color="
|
| 3815 |
),
|
| 3816 |
-
tickfont=dict(size=12, color="
|
| 3817 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 3818 |
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 3819 |
),
|
|
@@ -3877,9 +3877,9 @@ def create_domain_specialization_matrix(df, metric_type="AC"):
|
|
| 3877 |
colorbar=dict(
|
| 3878 |
title=dict(
|
| 3879 |
text="Specialization<br>Strength",
|
| 3880 |
-
font=dict(color="
|
| 3881 |
),
|
| 3882 |
-
tickfont=dict(color="
|
| 3883 |
bgcolor="rgba(1, 9, 26, 0.8)",
|
| 3884 |
bordercolor="rgba(245, 246, 247, 0.2)",
|
| 3885 |
borderwidth=1
|
|
@@ -3902,22 +3902,22 @@ def create_domain_specialization_matrix(df, metric_type="AC"):
|
|
| 3902 |
text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
|
| 3903 |
x=0.5,
|
| 3904 |
y=0.97,
|
| 3905 |
-
font=dict(size=22, family="'Geist', sans-serif", color="
|
| 3906 |
),
|
| 3907 |
xaxis=dict(
|
| 3908 |
title=dict(
|
| 3909 |
text="<b>Business Domains</b>",
|
| 3910 |
-
font=dict(size=16, color="
|
| 3911 |
),
|
| 3912 |
-
tickfont=dict(size=13, color="
|
| 3913 |
gridcolor="rgba(245, 246, 247, 0.1)"
|
| 3914 |
),
|
| 3915 |
yaxis=dict(
|
| 3916 |
title=dict(
|
| 3917 |
text="<b>Models</b>",
|
| 3918 |
-
font=dict(size=16, color="
|
| 3919 |
),
|
| 3920 |
-
tickfont=dict(size=11, color="
|
| 3921 |
gridcolor="rgba(245, 246, 247, 0.1)"
|
| 3922 |
),
|
| 3923 |
paper_bgcolor="#01091A",
|
|
@@ -4011,7 +4011,7 @@ def create_performance_gap_analysis(df, metric_type="AC"):
|
|
| 4011 |
x=[row['Min'], row['Max']],
|
| 4012 |
y=[row['Domain'], row['Domain']],
|
| 4013 |
mode='markers',
|
| 4014 |
-
marker=dict(size=8, color='
|
| 4015 |
showlegend=False,
|
| 4016 |
hoverinfo='skip'
|
| 4017 |
))
|
|
@@ -4023,23 +4023,23 @@ def create_performance_gap_analysis(df, metric_type="AC"):
|
|
| 4023 |
text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
|
| 4024 |
x=0.5,
|
| 4025 |
y=0.97,
|
| 4026 |
-
font=dict(size=22, family="'Geist', sans-serif", color="
|
| 4027 |
),
|
| 4028 |
xaxis=dict(
|
| 4029 |
title=dict(
|
| 4030 |
text=f"<b>{metric_display} Score</b>",
|
| 4031 |
-
font=dict(size=16, color="
|
| 4032 |
),
|
| 4033 |
-
tickfont=dict(size=12, color="
|
| 4034 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 4035 |
range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
|
| 4036 |
),
|
| 4037 |
yaxis=dict(
|
| 4038 |
title=dict(
|
| 4039 |
text="<b>Business Domain</b>",
|
| 4040 |
-
font=dict(size=16, color="
|
| 4041 |
),
|
| 4042 |
-
tickfont=dict(size=13, color="
|
| 4043 |
gridcolor="rgba(245, 246, 247, 0.1)"
|
| 4044 |
),
|
| 4045 |
paper_bgcolor="#01091A",
|
|
@@ -4056,7 +4056,7 @@ def create_performance_gap_analysis(df, metric_type="AC"):
|
|
| 4056 |
xref="paper", yref="paper",
|
| 4057 |
x=0.98, y=0.02,
|
| 4058 |
xanchor='right', yanchor='bottom',
|
| 4059 |
-
font=dict(size=12, color='
|
| 4060 |
showarrow=False
|
| 4061 |
)
|
| 4062 |
|
|
@@ -4074,7 +4074,7 @@ def create_empty_chart(message):
|
|
| 4074 |
xanchor='center', yanchor='middle',
|
| 4075 |
font=dict(
|
| 4076 |
size=18,
|
| 4077 |
-
color="
|
| 4078 |
family="'Geist', sans-serif"
|
| 4079 |
),
|
| 4080 |
showarrow=False,
|
|
|
|
| 201 |
for idx in range(n)
|
| 202 |
)
|
| 203 |
label_spans = "\n".join(
|
| 204 |
+
f'<text x="{point(1.1, idx)[0]:.2f}" y="{point(1.1, idx)[1]:.2f}" text-anchor="middle" dominant-baseline="middle" font-size="9" fill="white">{label}</text>'
|
| 205 |
for idx, label in enumerate(labels)
|
| 206 |
)
|
| 207 |
svg = f"""
|
|
|
|
| 228 |
"description": "7개의 태스크 전반의 평균 성능을 한눈에 살펴보고 각 레벨 비교를 위한 기준점을 제공합니다."
|
| 229 |
},
|
| 230 |
"L1": {
|
| 231 |
+
"title": "<span style='color: white;'>L1 · 단일 도구 실행</span>",
|
| 232 |
+
"description": "<span style='color: white;'>단일 도구 실행 능력과 기본적인 명령 수행 정확도를 평가합니다.</span>"
|
| 233 |
},
|
| 234 |
"L2": {
|
| 235 |
+
"title": "<span style='color: white;'>L2 · 도구 선택 능력</span>",
|
| 236 |
+
"description": "<span style='color: white;'>요구 사항에 맞는 도구를 고르고 적절한 파라미터로 호출하는 능력을 측정합니다.</span>"
|
| 237 |
},
|
| 238 |
"L3": {
|
| 239 |
+
"title": "<span style='color: white;'>L3 · 순차적 추론 (Chaining)</span>",
|
| 240 |
+
"description": "<span style='color: white;'>복수 단계의 순차적 reasoning을 통해 문제를 해결하는 과정을 검증합니다.</span>"
|
| 241 |
},
|
| 242 |
"L4": {
|
| 243 |
+
"title": "<span style='color: white;'>L4 · 병렬적 추론 (Aggregation)</span>",
|
| 244 |
+
"description": "<span style='color: white;'>여러 소스의 정보를 병렬적으로 통합하고 요약하는 능력을 평가합니다.</span>"
|
| 245 |
},
|
| 246 |
"L5": {
|
| 247 |
+
"title": "<span style='color: white;'>L5 · 강건성 (Robustness / Fallback)</span>",
|
| 248 |
+
"description": "<span style='color: white;'>예상치 못한 오류나 실패 상황에 대한 인지와 대응 전략을 확인합니다.</span>"
|
| 249 |
},
|
| 250 |
"L6": {
|
| 251 |
+
"title": "<span style='color: white;'>L6 · 효율성 (Efficiency)</span>",
|
| 252 |
+
"description": "<span style='color: white;'>최소한의 호출과 비용으로 목표를 달성하는 운영 효율을 살펴봅니다.</span>"
|
| 253 |
},
|
| 254 |
"L7": {
|
| 255 |
+
"title": "<span style='color: white;'>L7 · 장기 컨텍스트 기억 (Contextual Memory)</span>",
|
| 256 |
+
"description": "<span style='color: white;'>장기 대화 맥락을 유지하고 적절히 활용하는 능력을 집중적으로 분석합니다.</span>"
|
| 257 |
}
|
| 258 |
}
|
| 259 |
default_level = "ALL"
|
|
|
|
| 291 |
border-collapse: collapse;
|
| 292 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
|
| 293 |
background: var(--bg-card);
|
| 294 |
+
color: white;
|
| 295 |
}
|
| 296 |
|
| 297 |
.v2-styled-table thead {
|
|
|
|
| 305 |
padding: 14px 12px;
|
| 306 |
text-align: left;
|
| 307 |
font-weight: 600;
|
| 308 |
+
color: white;
|
| 309 |
border-bottom: 2px solid var(--accent-primary);
|
| 310 |
font-size: 13px;
|
| 311 |
text-transform: uppercase;
|
|
|
|
| 319 |
.v2-styled-table td {
|
| 320 |
padding: 12px;
|
| 321 |
border-bottom: 1px solid var(--border-subtle);
|
| 322 |
+
color: white;
|
| 323 |
transition: all 0.2s ease;
|
| 324 |
}
|
| 325 |
|
|
|
|
| 339 |
|
| 340 |
.model-name {
|
| 341 |
font-weight: 500;
|
| 342 |
+
color: white;
|
| 343 |
transition: color 0.2s ease;
|
| 344 |
}
|
| 345 |
|
| 346 |
/* Keep model name color consistent on hover to emphasize row highlight */
|
| 347 |
.v2-styled-table tr:hover .model-name {
|
| 348 |
+
color: white;
|
| 349 |
}
|
| 350 |
|
| 351 |
.numeric-cell {
|
|
|
|
| 356 |
|
| 357 |
.highlight-header {
|
| 358 |
background: rgba(255, 210, 30, 0.14);
|
| 359 |
+
color: white;
|
| 360 |
}
|
| 361 |
|
| 362 |
.highlight-cell {
|
| 363 |
background: rgba(255, 210, 30, 0.08);
|
| 364 |
+
color: white;
|
| 365 |
font-weight: 600;
|
| 366 |
}
|
| 367 |
</style>
|
|
|
|
| 459 |
return f"""
|
| 460 |
<div class="domain-selector-container leaderboard-intro">
|
| 461 |
<div class="domain-header">
|
| 462 |
+
<h2 class="domain-title" style="color: white;">Agent Leaderboard · {level_title}</h2>
|
| 463 |
+
<p class="domain-subtitle" style="color: white;">{level_description}</p>
|
| 464 |
</div>
|
| 465 |
<div class="dataframe-container">
|
| 466 |
"""
|
|
|
|
| 1097 |
# Links section below title
|
| 1098 |
gr.HTML("""
|
| 1099 |
<div class="hero-actions">
|
| 1100 |
+
<a href="https://hugging-face-krew.github.io/" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1101 |
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 1102 |
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
|
| 1103 |
<line x1="8" y1="12" x2="16" y2="12"/>
|
| 1104 |
</svg>
|
| 1105 |
<span>Blog</span>
|
| 1106 |
</a>
|
| 1107 |
+
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1108 |
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 1109 |
<path d="M9 19c-5 1.5-5-2.5-7-3"/>
|
| 1110 |
<path d="M20 21v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
|
|
|
|
| 1145 |
<span>80%</span>
|
| 1146 |
</div>
|
| 1147 |
<ul class="phase-list">
|
| 1148 |
+
<li style="color: white;">L1: 단일 도구 실행</li>
|
| 1149 |
+
<li style="color: white;">L2: 도구 선택 능력</li>
|
| 1150 |
+
<li style="color: white;">L3: 순차적 reasoning (Chaining)</li>
|
| 1151 |
+
<li style="color: white;">L4: 병렬적 reasoning (Aggregation)</li>
|
| 1152 |
+
<li style="color: white;">L5: 강건성 (Robustness / Fallback)</li>
|
| 1153 |
</ul>
|
| 1154 |
</div>
|
| 1155 |
<div class="phase-card">
|
|
|
|
| 1158 |
<span>20%</span>
|
| 1159 |
</div>
|
| 1160 |
<ul class="phase-list">
|
| 1161 |
+
<li style="color: white;">L6: 효율성 (Efficiency)</li>
|
| 1162 |
+
<li style="color: white;">L7: 장기 컨텍스트 기억 (Contextual Memory)</li>
|
| 1163 |
</ul>
|
| 1164 |
</div>
|
| 1165 |
</div>
|
|
|
|
| 1943 |
with gr.Column(elem_classes=["domain-selector-container"], elem_id="task-level-selector"):
|
| 1944 |
gr.HTML("""
|
| 1945 |
<div class="domain-header">
|
| 1946 |
+
<h2 class="domain-title" style="color: white;">🧠 Select Task Level</h2>
|
| 1947 |
+
<p class="domain-subtitle" style="color: white;">Ko-AgentBench의 ALL · L1~L7 단계별 에이전트 성능을 손쉽게 비교하세요.</p>
|
| 1948 |
</div>
|
| 1949 |
""")
|
| 1950 |
domain_filter = gr.Radio(
|
|
|
|
| 1960 |
with gr.Column(elem_classes=["domain-selector-container", "filters-sorting-container"], elem_id="filters-sorting-container"):
|
| 1961 |
gr.HTML("""
|
| 1962 |
<div class="domain-header">
|
| 1963 |
+
<h2 class="domain-title" style="color: white;">🔍 Filters & Sorting</h2>
|
| 1964 |
+
<p class="domain-subtitle" style="color: white;">모델 접근 방식과 정렬 순서를 선택해 맞춤 뷰를 구성하세요.</p>
|
| 1965 |
</div>
|
| 1966 |
""")
|
| 1967 |
with gr.Row(elem_classes=["filters-sorting-row"]):
|
| 1968 |
with gr.Column(scale=1, elem_classes=["filter-group"]):
|
| 1969 |
with gr.Row(elem_classes=["filter-group-row"]):
|
| 1970 |
+
gr.HTML("<span class='filter-group-label' style='color: white;'>Model Access</span>")
|
| 1971 |
model_type_filter = gr.Radio(
|
| 1972 |
choices=["All", "OSS", "API"],
|
| 1973 |
value="All",
|
|
|
|
| 1977 |
)
|
| 1978 |
with gr.Column(scale=1, elem_classes=["filter-group"]):
|
| 1979 |
with gr.Row(elem_classes=["filter-group-row"]):
|
| 1980 |
+
gr.HTML("<span class='filter-group-label' style='color: white;'>Sort Order</span>")
|
| 1981 |
sort_order = gr.Radio(
|
| 1982 |
choices=["Descending", "Ascending"],
|
| 1983 |
value="Descending",
|
|
|
|
| 1999 |
gr.HTML("""
|
| 2000 |
<div class="domain-selector-container domain-performance-container">
|
| 2001 |
<div class="domain-header">
|
| 2002 |
+
<h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
|
| 2003 |
+
<p class="domain-subtitle" style="color: white;">Track six essential pillars: Success, Execution, Reasoning, Robustness, Efficiency, and Call Validity.</p>
|
| 2004 |
</div>
|
| 2005 |
""")
|
| 2006 |
|
| 2007 |
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="radar-model-selector"):
|
| 2008 |
gr.HTML("""
|
| 2009 |
<div class="domain-header">
|
| 2010 |
+
<h2 class="domain-title" style="color: white;">🎯 Select Models for Comparison</h2>
|
| 2011 |
+
<p class="domain-subtitle" style="color: white;">Choose up to 5 models to map on the capability radar.</p>
|
| 2012 |
</div>
|
| 2013 |
""")
|
| 2014 |
model_selector = gr.Dropdown(
|
|
|
|
| 2039 |
gr.HTML("""
|
| 2040 |
<div class="domain-selector-container domain-performance-container level-metrics-wrapper">
|
| 2041 |
<div class="domain-header">
|
| 2042 |
+
<h2 class="domain-title" style="color: white;">Level-Specific Metric Spotlight</h2>
|
| 2043 |
+
<p class="domain-subtitle" style="color: white;">Dive deeper into each Ko-AgentBench stage and compare model scores across its unique evaluation metrics.</p>
|
| 2044 |
</div>
|
| 2045 |
""")
|
| 2046 |
|
| 2047 |
with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
|
| 2048 |
gr.HTML("""
|
| 2049 |
<div class="domain-header">
|
| 2050 |
+
<h2 class="domain-title" style="color: white;">🧭 Select Task Level and Models</h2>
|
| 2051 |
+
<p class="domain-subtitle" style="color: white;">Choose a level and up to 5 models to explore their detailed SR-driven metrics.</p>
|
| 2052 |
</div>
|
| 2053 |
""")
|
| 2054 |
level_metric_selector = gr.Dropdown(
|
|
|
|
| 2085 |
gr.HTML("""
|
| 2086 |
<div class="domain-selector-container domain-performance-container heatmap-wrapper">
|
| 2087 |
<div class="domain-header">
|
| 2088 |
+
<h2 class="domain-title" style="color: white;">Comprehensive Performance Heatmap</h2>
|
| 2089 |
+
<p class="domain-subtitle" style="color: white;">View Ko-AgentBench SR scores across L1~L7 for each model in a single glance.</p>
|
| 2090 |
</div>
|
| 2091 |
<div class="chart-container heatmap-chart-container">
|
| 2092 |
""")
|
|
|
|
| 2451 |
gr.HTML("""
|
| 2452 |
<div class="domain-selector-container performance-card-container">
|
| 2453 |
<div class="domain-header">
|
| 2454 |
+
<h2 class="domain-title" style="color: white;">Model Performance Card</h2>
|
| 2455 |
+
<p class="domain-subtitle" style="color: white;">Comprehensive performance card for any model - perfect for presentations and reports</p>
|
| 2456 |
</div>
|
| 2457 |
<div class="performance-card-content">
|
| 2458 |
""")
|
|
|
|
| 2460 |
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
|
| 2461 |
gr.HTML("""
|
| 2462 |
<div class="domain-header">
|
| 2463 |
+
<h2 class="domain-title" style="color: white;">🤖 Select Model</h2>
|
| 2464 |
+
<p class="domain-subtitle" style="color: white;">비교할 모델을 선택하세요.</p>
|
| 2465 |
</div>
|
| 2466 |
""")
|
| 2467 |
card_model_selector = gr.Dropdown(
|
|
|
|
| 3109 |
hoverlabel=dict(
|
| 3110 |
bgcolor="rgba(1, 9, 26, 0.95)",
|
| 3111 |
bordercolor=colors['line'],
|
| 3112 |
+
font=dict(color="white", size=12, family="'Geist', sans-serif")
|
| 3113 |
)
|
| 3114 |
)
|
| 3115 |
)
|
|
|
|
| 3132 |
ticktext=tick_text,
|
| 3133 |
tickfont=dict(
|
| 3134 |
size=11,
|
| 3135 |
+
color='white',
|
| 3136 |
family="'Geist Mono', monospace"
|
| 3137 |
)
|
| 3138 |
),
|
|
|
|
| 3144 |
tickfont=dict(
|
| 3145 |
size=13,
|
| 3146 |
family="'Geist', sans-serif",
|
| 3147 |
+
color='white',
|
| 3148 |
weight=600
|
| 3149 |
),
|
| 3150 |
rotation=90,
|
|
|
|
| 3158 |
y=-0.15,
|
| 3159 |
xanchor="center",
|
| 3160 |
x=0.5,
|
| 3161 |
+
font=dict(size=12, family="'Geist', sans-serif", color='white'),
|
| 3162 |
bgcolor='rgba(1, 9, 26, 0.8)',
|
| 3163 |
bordercolor='rgba(245, 246, 247, 0.2)',
|
| 3164 |
borderwidth=1,
|
|
|
|
| 3172 |
font=dict(
|
| 3173 |
size=22,
|
| 3174 |
family="'Geist', sans-serif",
|
| 3175 |
+
color="white",
|
| 3176 |
weight=700
|
| 3177 |
),
|
| 3178 |
),
|
|
|
|
| 3271 |
hovertemplate="<b>%{y}</b><br><span style='color:#FFD21E'>%{x}</span><br>SR · %{z:.3f}<extra></extra>",
|
| 3272 |
colorbar=dict(
|
| 3273 |
title="Success Rate",
|
| 3274 |
+
titlefont=dict(color="white", family="'Geist', sans-serif", size=12),
|
| 3275 |
+
tickfont=dict(color="white", family="'Geist', sans-serif", size=10),
|
| 3276 |
thickness=12,
|
| 3277 |
len=0.7,
|
| 3278 |
outlinecolor="rgba(255, 255, 255, 0.1)",
|
|
|
|
| 3309 |
margin=dict(t=80, b=90, l=110, r=160),
|
| 3310 |
height=520,
|
| 3311 |
width=1450,
|
| 3312 |
+
font=dict(family="'Geist', sans-serif", color="white"),
|
| 3313 |
xaxis=dict(
|
| 3314 |
tickangle=-25,
|
| 3315 |
showgrid=False,
|
| 3316 |
ticks="",
|
| 3317 |
+
tickfont=dict(size=11, family="'Geist', sans-serif", color="white")
|
| 3318 |
),
|
| 3319 |
yaxis=dict(
|
| 3320 |
showgrid=False,
|
| 3321 |
ticks="",
|
| 3322 |
+
tickfont=dict(size=12, family="'Geist', sans-serif", color="white")
|
| 3323 |
),
|
| 3324 |
annotations=annotations,
|
| 3325 |
title=dict(
|
|
|
|
| 3329 |
font=dict(
|
| 3330 |
size=20,
|
| 3331 |
family="'Geist', sans-serif",
|
| 3332 |
+
color="white",
|
| 3333 |
weight=700
|
| 3334 |
),
|
| 3335 |
)
|
|
|
|
| 3349 |
xanchor='center', yanchor='middle',
|
| 3350 |
font=dict(
|
| 3351 |
size=18,
|
| 3352 |
+
color="white",
|
| 3353 |
family="'Geist', sans-serif"
|
| 3354 |
),
|
| 3355 |
showarrow=False,
|
|
|
|
| 3372 |
font=dict(
|
| 3373 |
size=20,
|
| 3374 |
family="'Geist', sans-serif",
|
| 3375 |
+
color="white",
|
| 3376 |
weight=700
|
| 3377 |
),
|
| 3378 |
)
|
|
|
|
| 3500 |
bgcolor='rgba(1, 9, 26, 0.75)',
|
| 3501 |
bordercolor='rgba(245, 246, 247, 0.2)',
|
| 3502 |
borderwidth=1,
|
| 3503 |
+
font=dict(size=11, family="'Geist', sans-serif", color='white')
|
| 3504 |
),
|
| 3505 |
xaxis=dict(
|
| 3506 |
+
title=dict(text=f"<b>{level} Metric Score</b>", font=dict(size=14, color="white")),
|
| 3507 |
+
tickfont=dict(size=11, color="white"),
|
| 3508 |
gridcolor='rgba(245, 246, 247, 0.08)',
|
| 3509 |
zerolinecolor='rgba(245, 246, 247, 0.18)',
|
| 3510 |
range=x_range
|
| 3511 |
),
|
| 3512 |
yaxis=dict(
|
| 3513 |
+
tickfont=dict(size=13, color="white"),
|
| 3514 |
automargin=True
|
| 3515 |
),
|
| 3516 |
title=dict(
|
| 3517 |
text=f"<b>{level} Metric Breakdown</b>",
|
| 3518 |
x=0.5,
|
| 3519 |
y=0.98,
|
| 3520 |
+
font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
|
| 3521 |
)
|
| 3522 |
)
|
| 3523 |
return fig
|
|
|
|
| 3530 |
xref="paper", yref="paper",
|
| 3531 |
x=0.5, y=0.5,
|
| 3532 |
xanchor='center', yanchor='middle',
|
| 3533 |
+
font=dict(size=18, color="white", family="'Geist', sans-serif"),
|
| 3534 |
showarrow=False,
|
| 3535 |
bgcolor="rgba(245, 246, 247, 0.05)",
|
| 3536 |
bordercolor="rgba(245, 246, 247, 0.2)",
|
|
|
|
| 3547 |
text="<b>Level Metric Breakdown</b>",
|
| 3548 |
x=0.5,
|
| 3549 |
y=0.98,
|
| 3550 |
+
font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
|
| 3551 |
)
|
| 3552 |
)
|
| 3553 |
fig.update_xaxes(visible=False)
|
|
|
|
| 3566 |
xanchor='center', yanchor='middle',
|
| 3567 |
font=dict(
|
| 3568 |
size=18,
|
| 3569 |
+
color="white",
|
| 3570 |
family="'Geist', sans-serif"
|
| 3571 |
),
|
| 3572 |
showarrow=False,
|
|
|
|
| 3589 |
font=dict(
|
| 3590 |
size=22,
|
| 3591 |
family="'Geist', sans-serif",
|
| 3592 |
+
color="white",
|
| 3593 |
weight=700
|
| 3594 |
),
|
| 3595 |
),
|
|
|
|
| 3647 |
name=legend_name,
|
| 3648 |
text=df_type['Model'],
|
| 3649 |
textposition="top center",
|
| 3650 |
+
textfont=dict(size=10, color='white'),
|
| 3651 |
marker=dict(
|
| 3652 |
size=df_type['Avg Turns'] * 3, # Size based on number of turns
|
| 3653 |
color=color_map.get(model_type, '#F5F6F7'),
|
|
|
|
| 3671 |
# Add quadrant labels
|
| 3672 |
fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance<br>Low Cost",
|
| 3673 |
showarrow=False, xref="paper", yref="paper",
|
| 3674 |
+
font=dict(size=12, color="white"), bgcolor="rgba(245, 246, 247, 0.1)")
|
| 3675 |
fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
|
| 3676 |
showarrow=False, xref="paper", yref="paper",
|
| 3677 |
font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)")
|
|
|
|
| 3683 |
text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
|
| 3684 |
x=0.5,
|
| 3685 |
y=0.97,
|
| 3686 |
+
font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
|
| 3687 |
),
|
| 3688 |
xaxis=dict(
|
| 3689 |
title=dict(
|
| 3690 |
text=f"<b>{metric_display}</b>",
|
| 3691 |
+
font=dict(size=16, color="white")
|
| 3692 |
),
|
| 3693 |
+
tickfont=dict(size=12, color="white"),
|
| 3694 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 3695 |
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 3696 |
),
|
| 3697 |
yaxis=dict(
|
| 3698 |
title=dict(
|
| 3699 |
text="<b>Average Session Cost ($)</b>",
|
| 3700 |
+
font=dict(size=16, color="white")
|
| 3701 |
),
|
| 3702 |
+
tickfont=dict(size=12, color="white"),
|
| 3703 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 3704 |
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 3705 |
),
|
|
|
|
| 3714 |
y=1.02,
|
| 3715 |
xanchor="right",
|
| 3716 |
x=1,
|
| 3717 |
+
font=dict(size=12, family="'Geist', sans-serif", color='white'),
|
| 3718 |
bgcolor='rgba(1, 9, 26, 0.8)',
|
| 3719 |
bordercolor='rgba(245, 246, 247, 0.2)',
|
| 3720 |
borderwidth=1
|
|
|
|
| 3749 |
mode='markers+text',
|
| 3750 |
text=df_filtered['Model'],
|
| 3751 |
textposition="top center",
|
| 3752 |
+
textfont=dict(size=9, color='white'),
|
| 3753 |
marker=dict(
|
| 3754 |
size=12,
|
| 3755 |
color=df_filtered['Avg Total Cost'],
|
|
|
|
| 3758 |
colorbar=dict(
|
| 3759 |
title=dict(
|
| 3760 |
text="Cost ($)",
|
| 3761 |
+
font=dict(color="white")
|
| 3762 |
),
|
| 3763 |
+
tickfont=dict(color="white"),
|
| 3764 |
bgcolor="rgba(1, 9, 26, 0.8)",
|
| 3765 |
bordercolor="rgba(245, 246, 247, 0.2)",
|
| 3766 |
borderwidth=1,
|
|
|
|
| 3785 |
# Add quadrant labels
|
| 3786 |
fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate",
|
| 3787 |
showarrow=False, xref="paper", yref="paper",
|
| 3788 |
+
font=dict(size=12, color="white", weight=600))
|
| 3789 |
fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
|
| 3790 |
showarrow=False, xref="paper", yref="paper",
|
| 3791 |
font=dict(size=12, color="#ffd21e", weight=600))
|
|
|
|
| 3797 |
text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
|
| 3798 |
x=0.5,
|
| 3799 |
y=0.97,
|
| 3800 |
+
font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
|
| 3801 |
),
|
| 3802 |
xaxis=dict(
|
| 3803 |
title=dict(
|
| 3804 |
text=f"<b>{metric_display}</b>",
|
| 3805 |
+
font=dict(size=16, color="white")
|
| 3806 |
),
|
| 3807 |
+
tickfont=dict(size=12, color="white"),
|
| 3808 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 3809 |
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 3810 |
),
|
| 3811 |
yaxis=dict(
|
| 3812 |
title=dict(
|
| 3813 |
text="<b>Average Session Duration (seconds)</b>",
|
| 3814 |
+
font=dict(size=16, color="white")
|
| 3815 |
),
|
| 3816 |
+
tickfont=dict(size=12, color="white"),
|
| 3817 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 3818 |
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 3819 |
),
|
|
|
|
| 3877 |
colorbar=dict(
|
| 3878 |
title=dict(
|
| 3879 |
text="Specialization<br>Strength",
|
| 3880 |
+
font=dict(color="white")
|
| 3881 |
),
|
| 3882 |
+
tickfont=dict(color="white"),
|
| 3883 |
bgcolor="rgba(1, 9, 26, 0.8)",
|
| 3884 |
bordercolor="rgba(245, 246, 247, 0.2)",
|
| 3885 |
borderwidth=1
|
|
|
|
| 3902 |
text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
|
| 3903 |
x=0.5,
|
| 3904 |
y=0.97,
|
| 3905 |
+
font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
|
| 3906 |
),
|
| 3907 |
xaxis=dict(
|
| 3908 |
title=dict(
|
| 3909 |
text="<b>Business Domains</b>",
|
| 3910 |
+
font=dict(size=16, color="white")
|
| 3911 |
),
|
| 3912 |
+
tickfont=dict(size=13, color="white"),
|
| 3913 |
gridcolor="rgba(245, 246, 247, 0.1)"
|
| 3914 |
),
|
| 3915 |
yaxis=dict(
|
| 3916 |
title=dict(
|
| 3917 |
text="<b>Models</b>",
|
| 3918 |
+
font=dict(size=16, color="white")
|
| 3919 |
),
|
| 3920 |
+
tickfont=dict(size=11, color="white"),
|
| 3921 |
gridcolor="rgba(245, 246, 247, 0.1)"
|
| 3922 |
),
|
| 3923 |
paper_bgcolor="#01091A",
|
|
|
|
| 4011 |
x=[row['Min'], row['Max']],
|
| 4012 |
y=[row['Domain'], row['Domain']],
|
| 4013 |
mode='markers',
|
| 4014 |
+
marker=dict(size=8, color='white', line=dict(width=2, color='#01091A')),
|
| 4015 |
showlegend=False,
|
| 4016 |
hoverinfo='skip'
|
| 4017 |
))
|
|
|
|
| 4023 |
text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
|
| 4024 |
x=0.5,
|
| 4025 |
y=0.97,
|
| 4026 |
+
font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
|
| 4027 |
),
|
| 4028 |
xaxis=dict(
|
| 4029 |
title=dict(
|
| 4030 |
text=f"<b>{metric_display} Score</b>",
|
| 4031 |
+
font=dict(size=16, color="white")
|
| 4032 |
),
|
| 4033 |
+
tickfont=dict(size=12, color="white"),
|
| 4034 |
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 4035 |
range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
|
| 4036 |
),
|
| 4037 |
yaxis=dict(
|
| 4038 |
title=dict(
|
| 4039 |
text="<b>Business Domain</b>",
|
| 4040 |
+
font=dict(size=16, color="white")
|
| 4041 |
),
|
| 4042 |
+
tickfont=dict(size=13, color="white"),
|
| 4043 |
gridcolor="rgba(245, 246, 247, 0.1)"
|
| 4044 |
),
|
| 4045 |
paper_bgcolor="#01091A",
|
|
|
|
| 4056 |
xref="paper", yref="paper",
|
| 4057 |
x=0.98, y=0.02,
|
| 4058 |
xanchor='right', yanchor='bottom',
|
| 4059 |
+
font=dict(size=12, color='white'),
|
| 4060 |
showarrow=False
|
| 4061 |
)
|
| 4062 |
|
|
|
|
| 4074 |
xanchor='center', yanchor='middle',
|
| 4075 |
font=dict(
|
| 4076 |
size=18,
|
| 4077 |
+
color="white",
|
| 4078 |
family="'Geist', sans-serif"
|
| 4079 |
),
|
| 4080 |
showarrow=False,
|