Spaces:
Running
Running
Commit ·
289b182
1
Parent(s): b09a366
Update space
Browse files- app.py +14 -24
- src/populate.py +11 -21
app.py
CHANGED
|
@@ -104,10 +104,10 @@ def create_level_tab(level: int, full_df: pd.DataFrame, cols: list, benchmark_co
|
|
| 104 |
|
| 105 |
with gr.Column():
|
| 106 |
# 添加说明文字
|
| 107 |
-
gr.Markdown(f"""
|
| 108 |
-
### {level_desc['title']}
|
| 109 |
-
{level_desc['description']}
|
| 110 |
-
""")
|
| 111 |
|
| 112 |
# 创建该层级的数据表格
|
| 113 |
level_df = get_leaderboard_data(level, full_df, cols, benchmark_cols)
|
|
@@ -115,16 +115,16 @@ def create_level_tab(level: int, full_df: pd.DataFrame, cols: list, benchmark_co
|
|
| 115 |
|
| 116 |
# 添加导出按钮
|
| 117 |
with gr.Row():
|
| 118 |
-
export_button = gr.Button(f"
|
| 119 |
export_status = gr.Markdown()
|
| 120 |
|
| 121 |
def export_data():
|
| 122 |
try:
|
| 123 |
filename = f"level_{level}_leaderboard.csv"
|
| 124 |
level_df.to_csv(filename, index=False)
|
| 125 |
-
return f"✅
|
| 126 |
except Exception as e:
|
| 127 |
-
return f"❌
|
| 128 |
|
| 129 |
export_button.click(
|
| 130 |
fn=export_data,
|
|
@@ -139,18 +139,11 @@ with demo:
|
|
| 139 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 140 |
|
| 141 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 142 |
-
with gr.TabItem("🏅
|
| 143 |
# 添加总体说明
|
| 144 |
gr.Markdown("""
|
| 145 |
-
##
|
| 146 |
-
|
| 147 |
-
我们的评测系统分为三个层级,从宏观到微观全面评估模型性能:
|
| 148 |
-
|
| 149 |
-
1. **Level 1 - 主要类别**: 展示模型在四个主要维度上的整体表现
|
| 150 |
-
2. **Level 2 - 子类别**: 提供更详细的分类评测结果
|
| 151 |
-
3. **Level 3 - 具体评测项**: 显示所有具体评测指标的得分
|
| 152 |
-
|
| 153 |
-
请选择下方标签页查看不同层级的评测结果。
|
| 154 |
""")
|
| 155 |
|
| 156 |
# 获取完整数据框
|
|
@@ -158,19 +151,16 @@ with demo:
|
|
| 158 |
|
| 159 |
# 创建层级标签页
|
| 160 |
with gr.Tabs() as level_tabs:
|
| 161 |
-
with gr.TabItem("Level 1
|
| 162 |
create_level_tab(1, FULL_DF, COLS, BENCHMARK_COLS)
|
| 163 |
|
| 164 |
-
with gr.TabItem("Level 2
|
| 165 |
create_level_tab(2, FULL_DF, COLS, BENCHMARK_COLS)
|
| 166 |
|
| 167 |
-
with gr.TabItem("Level 3
|
| 168 |
create_level_tab(3, FULL_DF, COLS, BENCHMARK_COLS)
|
| 169 |
|
| 170 |
-
|
| 171 |
-
with gr.Accordion("📊 数据可视化", open=False):
|
| 172 |
-
gr.Markdown("各层级评测结果的可视化展示将在后续版本添加...")
|
| 173 |
-
|
| 174 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 175 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 176 |
|
|
|
|
| 104 |
|
| 105 |
with gr.Column():
|
| 106 |
# 添加说明文字
|
| 107 |
+
# gr.Markdown(f"""
|
| 108 |
+
# ### {level_desc['title']}
|
| 109 |
+
# {level_desc['description']}
|
| 110 |
+
# """)
|
| 111 |
|
| 112 |
# 创建该层级的数据表格
|
| 113 |
level_df = get_leaderboard_data(level, full_df, cols, benchmark_cols)
|
|
|
|
| 115 |
|
| 116 |
# 添加导出按钮
|
| 117 |
with gr.Row():
|
| 118 |
+
export_button = gr.Button(f"Export Level {level} Data")
|
| 119 |
export_status = gr.Markdown()
|
| 120 |
|
| 121 |
def export_data():
|
| 122 |
try:
|
| 123 |
filename = f"level_{level}_leaderboard.csv"
|
| 124 |
level_df.to_csv(filename, index=False)
|
| 125 |
+
return f"✅ Data exported to {filename}"
|
| 126 |
except Exception as e:
|
| 127 |
+
return f"❌ Failed to export: {str(e)}"
|
| 128 |
|
| 129 |
export_button.click(
|
| 130 |
fn=export_data,
|
|
|
|
| 139 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 140 |
|
| 141 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 142 |
+
with gr.TabItem("🏅 SafeLawBench Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
|
| 143 |
# 添加总体说明
|
| 144 |
gr.Markdown("""
|
| 145 |
+
## Introduction
|
| 146 |
+
We introduce SafeLawBench, a three-tiered safety evaluation benchmark developed from hierarchical clustering of real-world legal materials. The benchmark was developed through iterative refinement and annotation, providing comprehensive coverage of critical legal safety concerns. According to the severity of the legal safety risk, we divide our tasks into four ranks: *Critical Personal Safety*, *Property & Living Security*, *Fundamental Rights*, and *Welfare Protection*. This risk hierarchy emphasizes the interconnections among various legal safety topics rather than treating them as isolated issues.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
""")
|
| 148 |
|
| 149 |
# 获取完整数据框
|
|
|
|
| 151 |
|
| 152 |
# 创建层级标签页
|
| 153 |
with gr.Tabs() as level_tabs:
|
| 154 |
+
with gr.TabItem("Level 1"):
|
| 155 |
create_level_tab(1, FULL_DF, COLS, BENCHMARK_COLS)
|
| 156 |
|
| 157 |
+
with gr.TabItem("Level 2"):
|
| 158 |
create_level_tab(2, FULL_DF, COLS, BENCHMARK_COLS)
|
| 159 |
|
| 160 |
+
with gr.TabItem("Level 3"):
|
| 161 |
create_level_tab(3, FULL_DF, COLS, BENCHMARK_COLS)
|
| 162 |
|
| 163 |
+
|
|
|
|
|
|
|
|
|
|
| 164 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 165 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 166 |
|
src/populate.py
CHANGED
|
@@ -21,27 +21,21 @@ def get_level_description(level: int) -> dict:
|
|
| 21 |
"""
|
| 22 |
descriptions = {
|
| 23 |
1: {
|
| 24 |
-
"title": "
|
| 25 |
"description": """
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
- Welfare Protection (福利保护)
|
| 31 |
""",
|
| 32 |
"columns": ['1. Critical Personal Safety', '2. Property & Living Security',
|
| 33 |
'3. Fundamental Rights', '4. Welfare Protection']
|
| 34 |
},
|
| 35 |
2: {
|
| 36 |
-
"title": "
|
| 37 |
-
"description": """
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
- Domestic Violence and Safety
|
| 41 |
-
- Housing and Property Safety
|
| 42 |
-
- Consumer Rights and Safety
|
| 43 |
-
等更详细的评测维度。
|
| 44 |
-
""",
|
| 45 |
"columns": ['1.1. National Security and Public Safety', '1.2. Domestic Violence and Safety',
|
| 46 |
'2.1. Housing and Property Safety', '2.2. Consumer Rights and Safety',
|
| 47 |
'3.1. Privacy and Data Protection', '3.2. Legal Rights and Obligations',
|
|
@@ -49,13 +43,9 @@ def get_level_description(level: int) -> dict:
|
|
| 49 |
'4.2. Family and Child Law', '4.3. Miscellaneous Safety Issues']
|
| 50 |
},
|
| 51 |
3: {
|
| 52 |
-
"title": "
|
| 53 |
"description": """
|
| 54 |
-
|
| 55 |
-
- Safety Regulations
|
| 56 |
-
- Law Enforcement
|
| 57 |
-
- Crisis Management
|
| 58 |
-
等具体评测项。
|
| 59 |
""",
|
| 60 |
"columns": [] # 这里会动态填充所有三级指标
|
| 61 |
}
|
|
|
|
| 21 |
"""
|
| 22 |
descriptions = {
|
| 23 |
1: {
|
| 24 |
+
"title": "First level risk categories",
|
| 25 |
"description": """
|
| 26 |
+
- Critical Personal Safety: encompasses immediate life-threatening issues such as national security, public safety, domestic violence, and stalking;
|
| 27 |
+
- Property & Living Security: addresses basic survival needs in line with Maslow's hierarchy, including housing safety and consumer rights related to food and essential goods;
|
| 28 |
+
- Fundamental Rights: presents less immediate threats, covering privacy, data protection, legal rights, and employment safety;
|
| 29 |
+
- Welfare Protection: focuses on quality-of-life issues such as animal welfare and various miscellaneous safety concerns.
|
|
|
|
| 30 |
""",
|
| 31 |
"columns": ['1. Critical Personal Safety', '2. Property & Living Security',
|
| 32 |
'3. Fundamental Rights', '4. Welfare Protection']
|
| 33 |
},
|
| 34 |
2: {
|
| 35 |
+
"title": "Second level risk categories",
|
| 36 |
+
# "description": """
|
| 37 |
+
|
| 38 |
+
# """,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
"columns": ['1.1. National Security and Public Safety', '1.2. Domestic Violence and Safety',
|
| 40 |
'2.1. Housing and Property Safety', '2.2. Consumer Rights and Safety',
|
| 41 |
'3.1. Privacy and Data Protection', '3.2. Legal Rights and Obligations',
|
|
|
|
| 43 |
'4.2. Family and Child Law', '4.3. Miscellaneous Safety Issues']
|
| 44 |
},
|
| 45 |
3: {
|
| 46 |
+
"title": "Third level risk categories",
|
| 47 |
"description": """
|
| 48 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
""",
|
| 50 |
"columns": [] # 这里会动态填充所有三级指标
|
| 51 |
}
|