Spaces:

huggingface-KREW
/

Ko-AgentBench

Running on CPU Upgrade

App Files Files Community

xhaktm00 committed on Oct 28

Commit

6390f61

1 Parent(s): 23cd914

한글 텍스트 수정

Browse files

Files changed (1) hide show

tabs/leaderboard_v1.py +61 -42

tabs/leaderboard_v1.py CHANGED Viewed

@@ -225,7 +225,7 @@ def create_leaderboard_v2_tab():
     level_details = {
         "ALL": {
             "title": "ALL · 전체 태스크",
-            "description": "7개의 태스크 전반의 평균 성능을 한눈에 살펴보고 각 레벨 비교를 위한 기준점을 제공합니다."
         },
         "L1": {
             "title": "<span style='color: white;'>L1 · 단일 도구 실행</span>",
@@ -237,7 +237,7 @@ def create_leaderboard_v2_tab():
         },
         "L3": {
             "title": "<span style='color: white;'>L3 · 순차적 추론 (Chaining)</span>",
-            "description": "<span style='color: white;'>복수 단계의 순차적 reasoning을 통해 문제를 해결하는 과정을 검증합니다.</span>"
         },
         "L4": {
             "title": "<span style='color: white;'>L4 · 병렬적 추론 (Aggregation)</span>",
@@ -380,7 +380,7 @@ def create_leaderboard_v2_tab():
         if overall_highlight:
             overall_header_classes.append("highlight-header")
         table_html += f"""
-                        <th class="{' '.join(overall_header_classes)}" title="Average success rate across all levels">
                             <span class="metric-header">Overall <span class="info-icon">ⓘ</span></span>
                         </th>
         """
@@ -389,7 +389,7 @@ def create_leaderboard_v2_tab():
             if highlight_map.get(level):
                 header_classes.append("highlight-header")
             table_html += f"""
-                        <th class="{' '.join(header_classes)}" title="Average success rate for {level}">
                             <span class="metric-header">{level} <span class="info-icon">ⓘ</span></span>
                         </th>
             """
@@ -896,7 +896,7 @@ def create_leaderboard_v2_tab():
     .section-title {
         font-size: 2.2rem;
-        font-weight: 700;
         color: var(--text-primary);
         margin-bottom: 12px;
         text-align: center !important;
@@ -905,7 +905,7 @@ def create_leaderboard_v2_tab():
     .section-lead, .section-subtitle {
         font-size: 1.1rem;
         color: var(--text-secondary);
-        max-width: 720px;
         margin: 0 auto 24px auto;
         line-height: 1.7;
         text-align: center !important;
@@ -983,7 +983,7 @@ def create_leaderboard_v2_tab():
     }
     .scenario-body {
-        max-width: 760px;
         margin: 0 auto;
         text-align: center;
     }
@@ -1091,7 +1091,7 @@ def create_leaderboard_v2_tab():
     gr.HTML("""
     <div style="text-align: center; padding: 20px 0;">
         <h1 class="hero-title">Hugging Face KREW Ko-AgentBench</h1>
-        <p class="hero-subtitle">한국 실사용 환경 특화 에이전트 벤치마크</p>
     </div>
     """)
@@ -1138,7 +1138,7 @@ def create_leaderboard_v2_tab():
         <div class="section-header">
             <h2 class="section-title">단계별 태스크 설계</h2>
         </div>
-        <p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 720px; line-height: 1.7; word-break: keep-all;">단순 도구 호출부터 장기적 맥락 능력, 강건성 처리 능력까지 에이전트의 능력을 7단계로 입체적으로 분석하였습니다.</p>
         <div class="phase-grid">
             <div class="phase-card">
                 <h3>Single-Turn</h3>
@@ -1148,9 +1148,9 @@ def create_leaderboard_v2_tab():
                 <ul class="phase-list">
                     <li style="color: white;">L1: 단일 도구 실행</li>
                     <li style="color: white;">L2: 도구 선택 능력</li>
-                    <li style="color: white;">L3: 순차적 reasoning (Chaining)</li>
-                    <li style="color: white;">L4: 병렬적 reasoning (Aggregation)</li>
-                    <li style="color: white;">L5: 강건성 (Robustness / Fallback)</li>
                 </ul>
             </div>
             <div class="phase-card">
@@ -1159,8 +1159,8 @@ def create_leaderboard_v2_tab():
                     <span>20%</span>
                 </div>
                 <ul class="phase-list">
-                    <li style="color: white;">L6: 효율성 (Efficiency)</li>
-                    <li style="color: white;">L7: 장기 컨텍스트 기억 (Contextual Memory)</li>
                 </ul>
             </div>
         </div>
@@ -1171,10 +1171,11 @@ def create_leaderboard_v2_tab():
     gr.HTML("""
     <div class="dashboard-section emphasized">
         <div class="section-header">
-            <h2 class="section-title">18가지 한국형 API 사용 및 실생활 환경에 특화된 고품질 시나리오 구성</h2>
         </div>
         <div class="scenario-body">
-            <p>네이버, 지도, 카카오, 웹사이트 등 한국 실사용 환경 기반의 API를 기반으로 국내 사용자의 일상과 밀접한 '약속 예약', '블로그 후기 검색'과 같은 현실적인 문제 해결 시나리오를 구현했습니다.</p>
         </div>
         <div class="section-flow">⌄</div>
     </div>
@@ -1188,25 +1189,25 @@ def create_leaderboard_v2_tab():
         </div>
         <div class="criteria-grid">
             <div class="criteria-card">
-                <h3>핵심 기반 반복 평가</h3>
                 <ul>
-                    <li>실패 API 응답 개선</li>
-                    <li>'정보 속성 불일치성 변경' 등 기존 벤치마크의 고질적 문제 해결</li>
                     <li>벤치마크의 일관성과 신뢰도 보장</li>
                 </ul>
             </div>
             <div class="criteria-card">
                 <h3>강건성 테스트</h3>
                 <ul>
-                    <li>의도된 오류 상황(상품 단종)의 오류 인식/대응 능력(전략)까지 평가</li>
                     <li>현실 환경에서도 안정적으로 작동하는 모델 선별</li>
                 </ul>
             </div>
             <div class="criteria-card">
                 <h3>단계별 고유 정밀 지표</h3>
                 <ul>
-                    <li>도구 선택, 파라미터 구성, 데이터 흐름 등 문제 해결의 불필요/소요 단계별 평가</li>
-                    <li>모델의 강/약점 정량적으로 식별</li>
                 </ul>
             </div>
         </div>
@@ -1264,10 +1265,18 @@ def create_leaderboard_v2_tab():
     .domain-performance-container .domain-subtitle {
         font-size: 1.05rem;
-        max-width: 720px;
         margin: 0 auto;
     }
     .leaderboard-intro .domain-title,
     .domain-performance-container > .domain-header .domain-title,
     .performance-card-container > .domain-header .domain-title {
@@ -1296,7 +1305,7 @@ def create_leaderboard_v2_tab():
     .performance-card-container .domain-subtitle {
         font-size: 1.05rem;
-        max-width: 720px;
         margin: 0 auto;
     }
@@ -1962,7 +1971,7 @@ def create_leaderboard_v2_tab():
         gr.HTML("""
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">🔍 Filters & Sorting</h2>
-            <p class="domain-subtitle" style="color: white;">모델 접근 방식과 정렬 순서를 선택해 맞춤 뷰를 구성하세요.</p>
         </div>
         """)
         with gr.Row(elem_classes=["filters-sorting-row"]):
@@ -2001,7 +2010,8 @@ def create_leaderboard_v2_tab():
     <div class="domain-selector-container domain-performance-container">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
-            <p class="domain-subtitle" style="color: white;">Track six essential pillars: Success, Execution, Reasoning, Robustness, Efficiency, and Call Validity.</p>
         </div>
     """)
@@ -2009,7 +2019,7 @@ def create_leaderboard_v2_tab():
         gr.HTML("""
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">🎯 Select Models for Comparison</h2>
-            <p class="domain-subtitle" style="color: white;">Choose up to 5 models to map on the capability radar.</p>
         </div>
         """)
         model_selector = gr.Dropdown(
@@ -2041,7 +2051,7 @@ def create_leaderboard_v2_tab():
     <div class="domain-selector-container domain-performance-container level-metrics-wrapper">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Level-Specific Metric Spotlight</h2>
-            <p class="domain-subtitle" style="color: white;">Dive deeper into each Ko-AgentBench stage and compare model scores across its unique evaluation metrics.</p>
         </div>
     """)
@@ -2049,7 +2059,8 @@ def create_leaderboard_v2_tab():
         gr.HTML("""
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">🧭 Select Task Level and Models</h2>
-            <p class="domain-subtitle" style="color: white;">Choose a level and up to 5 models to explore their detailed SR-driven metrics.</p>
         </div>
         """)
         level_metric_selector = gr.Dropdown(
@@ -2087,7 +2098,7 @@ def create_leaderboard_v2_tab():
     <div class="domain-selector-container domain-performance-container heatmap-wrapper">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Comprehensive Performance Heatmap</h2>
-            <p class="domain-subtitle" style="color: white;">View Ko-AgentBench SR scores across L1~L7 for each model in a single glance.</p>
         </div>
         <div class="chart-container heatmap-chart-container">
     """)
@@ -2340,12 +2351,13 @@ def create_leaderboard_v2_tab():
                 return 'N/A'
             return f"{float(value):.3f}"
         radar_metrics = [
             ("기초 수행력", row.get('Execution Accuracy')),
-            ("복합 추론력", row.get('Complex Reasoning')),
-            ("견고성", row.get('Robustness')),
             ("맥락 효율성", row.get('Context & Efficiency')),
             ("전반적 성공률", row.get('Overall Success')),
             ("기본적 유효성", row.get('Call Validity')),
         ]
         radar_values = []
@@ -2401,7 +2413,7 @@ def create_leaderboard_v2_tab():
                     <div class="core-section">
                         <div class="core-metric-grid">
         """
-        ordered_labels = ["기초 수행력", "복합 추론력", "견고성", "맥락 효율성", "전반적 성공률", "기본적 유효성"]
         ordered_metrics = sorted(radar_metrics, key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels))
         top_metrics = ordered_metrics[:3]
         bottom_metrics = ordered_metrics[3:]
@@ -2453,7 +2465,12 @@ def create_leaderboard_v2_tab():
     <div class="domain-selector-container performance-card-container">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Model Performance Card</h2>
-            <p class="domain-subtitle" style="color: white;">Comprehensive performance card for any model - perfect for presentations and reports</p>
         </div>
         <div class="performance-card-content">
     """)
@@ -2474,7 +2491,7 @@ def create_leaderboard_v2_tab():
             elem_classes=["model-dropdown"]
         )
         download_card_btn = gr.Button(
-            "Download Card as PNG",
             elem_id="download-card-btn",
             elem_classes=["pill-button"]
         )
@@ -3022,13 +3039,15 @@ def create_leaderboard_v2_interface():
 def create_domain_radar_chart(df, selected_models=None, max_models=5):
     """Visualize six core capability metrics on a radar chart."""
     df = df.copy()
     metrics_info = [
-        {"column": "Overall Success", "label": "Overall Success", "description": "Average SR across L1-L7"},
-        {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
-        {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
-        {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
-        {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
-        {"column": "Call Validity", "label": "Call Validity", "description": "Average EPR_CVR across levels"},
     ]
     required_columns = [m["column"] for m in metrics_info]
@@ -3185,7 +3204,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
         autosize=True,
         annotations=[
             dict(
-                text="Galileo Agent Leaderboard",
                 xref="paper", yref="paper",
                 x=0.98, y=0.02,
                 xanchor='right', yanchor='bottom',

     level_details = {
         "ALL": {
             "title": "ALL · 전체 태스크",
+            "description": "L1~L7 단계의 평균 SR을 통해 모델의 전반적 성능 수준과 단계별 강점을 비교할 수 있습니다."
         },
         "L1": {
             "title": "<span style='color: white;'>L1 · 단일 도구 실행</span>",
         },
         "L3": {
             "title": "<span style='color: white;'>L3 · 순차적 추론 (Chaining)</span>",
+            "description": "<span style='color: white;'>복수 단계의 순차적 추론을 통해 문제를 해결하는 과정을 검증합니다.</span>"
         },
         "L4": {
             "title": "<span style='color: white;'>L4 · 병렬적 추론 (Aggregation)</span>",
         if overall_highlight:
             overall_header_classes.append("highlight-header")
         table_html += f"""
+                        <th class="{' '.join(overall_header_classes)}" title="L1~L7 모든 단계의 평균 성공률">
                             <span class="metric-header">Overall <span class="info-icon">ⓘ</span></span>
                         </th>
         """
             if highlight_map.get(level):
                 header_classes.append("highlight-header")
             table_html += f"""
+                        <th class="{' '.join(header_classes)}" title="평균 성공률 {level}">
                             <span class="metric-header">{level} <span class="info-icon">ⓘ</span></span>
                         </th>
             """
     .section-title {
         font-size: 2.2rem;
+        font-weight: 1000;
         color: var(--text-primary);
         margin-bottom: 12px;
         text-align: center !important;
     .section-lead, .section-subtitle {
         font-size: 1.1rem;
         color: var(--text-secondary);
+        max-width: 1500px;
         margin: 0 auto 24px auto;
         line-height: 1.7;
         text-align: center !important;
     }
     .scenario-body {
+        max-width: 1200px;
         margin: 0 auto;
         text-align: center;
     }
     gr.HTML("""
     <div style="text-align: center; padding: 20px 0;">
         <h1 class="hero-title">Hugging Face KREW Ko-AgentBench</h1>
+        <p class="hero-subtitle">한국 서비스 환경 특화 에이전트 벤치마크</p>
     </div>
     """)
         <div class="section-header">
             <h2 class="section-title">단계별 태스크 설계</h2>
         </div>
+        <p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 1000px; line-height: 1.7; word-break: keep-all;">단순 도구 호출부터 장기적 맥락 이해와 강건성 처리까지, 에이전트의 능력을 7단계 구분하여 입체적으로 분석했습니다.</p>
         <div class="phase-grid">
             <div class="phase-card">
                 <h3>Single-Turn</h3>
                 <ul class="phase-list">
                     <li style="color: white;">L1: 단일 도구 실행</li>
                     <li style="color: white;">L2: 도구 선택 능력</li>
+                    <li style="color: white;">L3: 순차적 추론</li>
+                    <li style="color: white;">L4: 병렬적 추론</li>
+                    <li style="color: white;">L5: 강건성</li>
                 </ul>
             </div>
             <div class="phase-card">
                     <span>20%</span>
                 </div>
                 <ul class="phase-list">
+                    <li style="color: white;">L6: 효율성</li>
+                    <li style="color: white;">L7: 장기 컨텍스트 기억</li>
                 </ul>
             </div>
         </div>
     gr.HTML("""
     <div class="dashboard-section emphasized">
         <div class="section-header">
+            <h2 class="section-title">국내 환경에 최적화된 18종 API를 활용한 실생활 시나리오 설계</h2>
         </div>
         <div class="scenario-body">
+            <p>네이버 지도, 카카오 등 국내 주요 서비스 API를 연동하여,
+‘약속 예약’, ‘블로그 후기 검색’ 등 실생활 밀착형 문제 해결 시나리오를 설계했습니다.</p>
         </div>
         <div class="section-flow">⌄</div>
     </div>
         </div>
         <div class="criteria-grid">
             <div class="criteria-card">
+                <h3>캐싱 기반 반복 평가</h3>
                 <ul>
+                    <li>실제 API 응답 캐싱</li>
+                    <li>'외부 API 불안정성, 정보 속성 불일치' 등 기존 벤치마크의 고질적 문제 해결</li>
                     <li>벤치마크의 일관성과 신뢰도 보장</li>
                 </ul>
             </div>
             <div class="criteria-card">
                 <h3>강건성 테스트</h3>
                 <ul>
+                    <li>의도된 오류 상황(상품 단종)의 오류 인식/대응 능력(전략) 평가</li>
                     <li>현실 환경에서도 안정적으로 작동하는 모델 선별</li>
                 </ul>
             </div>
             <div class="criteria-card">
                 <h3>단계별 고유 정밀 지표</h3>
                 <ul>
+                    <li>도구 선택, 파라미터 구성, 데이터 처리 흐름 등 각 단계의 문제 해결 효율성 평가</li>
+                    <li>모델의 강·약점 정량적으로 식별</li>
                 </ul>
             </div>
         </div>
     .domain-performance-container .domain-subtitle {
         font-size: 1.05rem;
+        max-width: 1000px;
+        margin: 0 auto;
+    }
+    .domain-performance-container .domain-subtitle_ {
+        font-size: 1.07rem;
+        max-width: 1000px;
         margin: 0 auto;
+        color: #bdbdbd;
     }
     .leaderboard-intro .domain-title,
     .domain-performance-container > .domain-header .domain-title,
     .performance-card-container > .domain-header .domain-title {
     .performance-card-container .domain-subtitle {
         font-size: 1.05rem;
+        max-width: 1000px;
         margin: 0 auto;
     }
         gr.HTML("""
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">🔍 Filters & Sorting</h2>
+            <p class="domain-subtitle" style="color: white;">모델 유형과 정렬 기준을 선택해 원하는 방식으로 결과를 살펴보세요.</p>
         </div>
         """)
         with gr.Row(elem_classes=["filters-sorting-row"]):
     <div class="domain-selector-container domain-performance-container">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
+            <p class="domain-subtitle_">#기초 수행력 #복합 추론력 #견고성 #맥락 효율성 #전반적 성공률 #기본적 유효성</p>
+            <p class="domain-subtitle" style="color: white;">6대 핵심 역량을 통해 모델의 수행 능력과 균형도를 분석하세요.</p>
         </div>
     """)
         gr.HTML("""
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">🎯 Select Models for Comparison</h2>
+            <p class="domain-subtitle" style="color: white;">레이더 차트에서 비교할 모델을 선택하세요.</p>
         </div>
         """)
         model_selector = gr.Dropdown(
     <div class="domain-selector-container domain-performance-container level-metrics-wrapper">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Level-Specific Metric Spotlight</h2>
+            <p class="domain-subtitle" style="color: white;">L1~L7 단계별로 고유한 평가 지표를 기준으로 모델 점수를 비교해보세요.</p>
         </div>
     """)
         gr.HTML("""
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">🧭 Select Task Level and Models</h2>
+            <p class="domain-subtitle" style="color: white;">L1~L7 단계와 모델을 선택해, SR 기반의 세부 지표를 탐색해보세요.
+</p>
         </div>
         """)
         level_metric_selector = gr.Dropdown(
     <div class="domain-selector-container domain-performance-container heatmap-wrapper">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Comprehensive Performance Heatmap</h2>
+            <p class="domain-subtitle" style="color: white;">각 모델의 L1~L7 단계별 SR 점수를 한눈에 확인할 수 있는 종합 성능 히트맵을 살펴보세요.</p>
         </div>
         <div class="chart-container heatmap-chart-container">
     """)
                 return 'N/A'
             return f"{float(value):.3f}"
+        # Use the same metric order as the domain radar chart: '견고성' (Robustness) is fourth, followed by '복합 추론력' and '기본적 유효성'
         radar_metrics = [
             ("기초 수행력", row.get('Execution Accuracy')),
             ("맥락 효율성", row.get('Context & Efficiency')),
             ("전반적 성공률", row.get('Overall Success')),
+            ("견고성", row.get('Robustness')),
+            ("복합 추론력", row.get('Complex Reasoning')),
             ("기본적 유효성", row.get('Call Validity')),
         ]
         radar_values = []
                     <div class="core-section">
                         <div class="core-metric-grid">
         """
+        ordered_labels = ["기초 수행력", "맥락 효율성", "전반적 성공률", "견고성", "복합 추론력", "기본적 유효성"]
         ordered_metrics = sorted(radar_metrics, key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels))
         top_metrics = ordered_metrics[:3]
         bottom_metrics = ordered_metrics[3:]
     <div class="domain-selector-container performance-card-container">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Model Performance Card</h2>
+            <p class="domain-subtitle" style="color: white;">
+                모델의 성능 스펙트럼을 6대 핵심 지표와 L1~L7 단계별 종합 성공률(SR)로 시각화한 정밀 분석 카드를 확인해보세요.
+            </p>
+            <p class="domain-note" style="color: #bdbdbd; font-size: 0.85em; margin-top: 4px;">
+                ※ Rank는 L1~L7 단계별 SR의 평균값을 기준으로 산정되었습니다.
+            </p>
         </div>
         <div class="performance-card-content">
     """)
             elem_classes=["model-dropdown"]
         )
         download_card_btn = gr.Button(
+            "PNG로 다운로드",
             elem_id="download-card-btn",
             elem_classes=["pill-button"]
         )
 def create_domain_radar_chart(df, selected_models=None, max_models=5):
     """Visualize six core capability metrics on a radar chart."""
     df = df.copy()
+    # Use the same metric order and Korean labels as the model performance card.
+    # NOTE: '견고성' (Robustness) is fourth in this list (not last); keep the order in sync with the card's metric list.
     metrics_info = [
+        {"column": "Execution Accuracy", "label": "기초 수행력", "description": "CallEM · ArgAcc · SelectAcc"},
+        {"column": "Context & Efficiency", "label": "맥락 효율성", "description": "ReuseRate · EffScore · ContextRetention"},
+        {"column": "Overall Success", "label": "전반적 성공률", "description": "L1~L7의 평균 성공률"},
+        {"column": "Robustness", "label": "견고성", "description": "AdaptiveRouting · FallbackSR"},
+        {"column": "Complex Reasoning", "label": "복합 추론력", "description": "ProvAcc · PSM · Coverage"},
+        {"column": "Call Validity", "label": "기본적 유효성", "description": "레벨별 EPR_CVR 평균"},
     ]
     required_columns = [m["column"] for m in metrics_info]
         autosize=True,
         annotations=[
             dict(
+                text="Ko-Agent Leaderboard",
                 xref="paper", yref="paper",
                 x=0.98, y=0.02,
                 xanchor='right', yanchor='bottom',