hellokawei committed on
Commit
42d2955
·
verified ·
1 Parent(s): 8082735

Update app.py

Files changed (1)
  1. app.py +336 -41
app.py CHANGED
@@ -1,9 +1,14 @@
  import gradio as gr
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

  from src.about import (
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
@@ -13,42 +18,275 @@ from src.about import (
      TITLE,
  )
  from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval


  def restart_space():
      API.restart_space(repo_id=REPO_ID)

  ### Space initialisation
  try:
-     print(EVAL_REQUESTS_PATH)
      snapshot_download(
          repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
      )
- except Exception:
      restart_space()
  try:
-     print(EVAL_RESULTS_PATH)
      snapshot_download(
          repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
      )
- except Exception:
      restart_space()


  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

  (
@@ -59,32 +297,84 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,

  def init_leaderboard(dataframe):
      if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
      return Leaderboard(
          value=dataframe,
          datatype=[c.type for c in fields(AutoEvalColumn)],
          select_columns=SelectColumns(
              default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
              cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
          ),
          search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
          hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
          filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
              ColumnFilter(
                  AutoEvalColumn.params.name,
                  type="slider",
                  min=0.01,
                  max=150,
-                 label="Select the number of parameters (B)",
              ),
              ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
              ),
          ],
-         bool_checkboxgroup_label="Hide models",
          interactive=False,
      )
@@ -98,17 +388,17 @@ with demo:
          with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
              leaderboard = init_leaderboard(LEADERBOARD_DF)

-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
              gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
              with gr.Column():
                  with gr.Row():
                      gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                  with gr.Column():
                      with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                          open=False,
                      ):
                          with gr.Row():
@@ -117,9 +407,10 @@ with demo:
                              headers=EVAL_COLS,
                              datatype=EVAL_TYPES,
                              row_count=5,
                          )
                      with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                          open=False,
                      ):
                          with gr.Row():
@@ -128,10 +419,11 @@ with demo:
                              headers=EVAL_COLS,
                              datatype=EVAL_TYPES,
                              row_count=5,
                          )

                      with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                          open=False,
                      ):
                          with gr.Row():
@@ -140,40 +432,42 @@ with demo:
                              headers=EVAL_COLS,
                              datatype=EVAL_TYPES,
                              row_count=5,
                          )
              with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

              with gr.Row():
                  with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                      model_type = gr.Dropdown(
                          choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
                          multiselect=False,
-                         value=None,
                          interactive=True,
                      )

                  with gr.Column():
                      precision = gr.Dropdown(
                          choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
                          multiselect=False,
                          value="float16",
                          interactive=True,
                      )
                      weight_type = gr.Dropdown(
                          choices=[i.value.name for i in WeightType],
-                         label="Weights type",
                          multiselect=False,
                          value="Original",
                          interactive=True,
                      )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

-                     submit_button = gr.Button("Submit Eval")
                      submission_result = gr.Markdown()
                      submit_button.click(
                          add_new_eval,
@@ -189,7 +483,7 @@
                      )

              with gr.Row():
-                 with gr.Accordion("📙 Citation", open=False):
                      citation_button = gr.Textbox(
                          value=CITATION_BUTTON_TEXT,
                          label=CITATION_BUTTON_LABEL,
@@ -199,6 +493,7 @@
          )

      scheduler = BackgroundScheduler()
-     scheduler.add_job(restart_space, "interval", seconds=1800)
      scheduler.start()
      demo.queue(default_concurrency_limit=40).launch()
 
+ # app.py
+
  import gradio as gr
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download
+ import os
+ import json  # json and os are used below for file handling
+
+ # Imports from the existing src package: it cannot be modified, but we keep using what it provides
  from src.about import (
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,

      TITLE,
  )
  from src.display.css_html_js import custom_css

+ # =====================================================================
+ # **Key changes start here: GRACE-related classes and functions defined directly in app.py**
+ # src/display/utils.py and src/populate.py cannot be modified,
+ # so parts of their functionality are redefined or overridden here to add the GRACE dimensions.
+ # =====================================================================
+
+ from enum import Enum
+ from typing import NamedTuple, List
+
+ # Redefine the Column class (if src/display/utils has one, this definition is the one app.py uses)
+ class Column(NamedTuple):
+     name: str
+     type: str
+     displayed_by_default: bool = True
+     never_hidden: bool = False
+     hidden: bool = False
+     filterable: bool = True
+
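+ # For illustration only (assumed behaviour of the NamedTuple above):
+ # Column("Params (B)", "number").name  -> "Params (B)"
+ # Column("Params (B)", "number").type  -> "number"
+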
+ # Redefine AutoEvalColumn with the GRACE dimensions added.
+ # A plain class is used here (rather than an Enum) so that expressions like
+ # AutoEvalColumn.model.name resolve to the column's display name, as the rest of the app expects.
+ class AutoEvalColumn:
+     # Mirror the existing AutoEvalColumn definition from src/display/utils.py as closely as possible.
+     # Note: without the exact original definition this may introduce inconsistencies;
+     # a reasonable generic version is used here, with the GRACE dimensions added.
+     # Make sure these column names match the column names in your evaluation results data.
+     model = Column("Model", "str", displayed_by_default=True, never_hidden=True)
+     model_type = Column("Model type", "str", displayed_by_default=True)
+     precision = Column("Precision", "str", displayed_by_default=False)
+     params = Column("Params (B)", "number", displayed_by_default=True)
+     license = Column("License", "str", displayed_by_default=False)
+     still_on_hub = Column("On Hub", "boolean", displayed_by_default=True, hidden=True)
+     # ... you can inspect the elements of a running leaderboard to infer the other default columns ...
+     # For example:
+     # dataset = Column("Dataset", "str", displayed_by_default=False)
+     # average_score = Column("Average Score", "number", displayed_by_default=True)  # assuming an overall score
+
+     # New columns for the GRACE framework
+     generalization_score = Column("G: Generalization", "number", displayed_by_default=True, filterable=True)
+     relevance_score = Column("R: Relevance", "number", displayed_by_default=True, filterable=True)
+     artistry_score = Column("A: Artistry", "number", displayed_by_default=True, filterable=True)
+     consistency_score = Column("C: Consistency", "number", displayed_by_default=True, filterable=True)
+     efficiency_score = Column("E: Efficiency", "number", displayed_by_default=True, filterable=True)
+
+ # Redefine the fields() helper: collect the Column attributes in definition order
+ def fields(cls: type) -> List[Column]:
+     return [v for v in vars(cls).values() if isinstance(v, Column)]
+
+ # Redefine the ModelType enum (pick one type as the project focus, e.g. LanguageModeling)
+ class ModelType(Enum):
+     LanguageModeling = "Language generation model"
+     ImageGeneration = "Image generation model"
+     AudioSynthesis = "Audio model"
+     # ... add other types as needed, following your actual src/display/utils.py or project requirements
+     Unknown = "Unknown"  # keep Unknown as a safety fallback
+
+     def to_str(self, sep: str = " : ") -> str:
+         return f"{self.name}{sep}{self.value}"
+
+ # Redefine the WeightType and Precision enums.
+ # Each member's value is a small named tuple exposing .name, matching how the
+ # rest of the app reads these enums (i.value.name).
+ class _EnumValue(NamedTuple):
+     name: str
+
+ class WeightType(Enum):
+     Original = _EnumValue("Original")
+     Lora = _EnumValue("Lora")
+     # Add other types if necessary from your original src/display/utils.py
+     # Example:
+     # Adapter = _EnumValue("Adapter")
+
+ class Precision(Enum):
+     float16 = _EnumValue("float16")
+     bfloat16 = _EnumValue("bfloat16")
+     # Add other types if necessary
+     Unknown = _EnumValue("Unknown")
+
+
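+ # For illustration (assumed values, given the enums above):
+ # ModelType.LanguageModeling.to_str(" : ")  -> "LanguageModeling : Language generation model"
+ # Precision.float16.value.name              -> "float16"
+ # WeightType.Original.value.name            -> "Original"
+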
+ # Redefine COLS, BENCHMARK_COLS, EVAL_COLS, EVAL_TYPES.
+ # These lists now use the AutoEvalColumn defined above in app.py.
+ COLS = fields(AutoEvalColumn)  # all columns, including the GRACE ones
+ BENCHMARK_COLS = [
+     AutoEvalColumn.model,
+     AutoEvalColumn.params,
+     AutoEvalColumn.generalization_score,
+     AutoEvalColumn.relevance_score,
+     AutoEvalColumn.artistry_score,
+     AutoEvalColumn.consistency_score,
+     AutoEvalColumn.efficiency_score,
+     # ... any other columns you want shown by default in the benchmark
+ ]
+ EVAL_COLS = [c.name for c in fields(AutoEvalColumn)]  # column names for the evaluation queues
+ EVAL_TYPES = [c.type for c in fields(AutoEvalColumn)]  # column types for the evaluation queues
+
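+ # Illustrative sanity check (assumed values, given the definitions above):
+ # EVAL_COLS  -> ["Model", "Model type", "Precision", "Params (B)", "License", "On Hub",
+ #                "G: Generalization", "R: Relevance", "A: Artistry", "C: Consistency", "E: Efficiency"]
+ # EVAL_TYPES -> ["str", "str", "str", "number", "str", "boolean",
+ #                "number", "number", "number", "number", "number"]
+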
+
+ # Redefine get_leaderboard_df and get_evaluation_queue_df.
+ # They now load the data and attach the GRACE dimensions directly in app.py,
+ # since src/populate.py cannot be modified.
+
+ def get_leaderboard_df(eval_results_path: str, eval_requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+     """
+     Load the evaluation results and build the leaderboard DataFrame.
+     Defined here in app.py so it can include the GRACE scores.
+     """
+     all_results = []
+
+     # ============== **Key section: where the GRACE scores come from** ==============
+     # Read the data according to your actual results file format and include the GRACE scores.
+     # We assume the results live under EVAL_RESULTS_PATH, one JSON file per model,
+     # e.g. EVAL_RESULTS_PATH/model_name/results.json
+     if os.path.exists(eval_results_path) and os.path.isdir(eval_results_path):
+         for model_dir in os.listdir(eval_results_path):
+             model_path = os.path.join(eval_results_path, model_dir)
+             if os.path.isdir(model_path):
+                 # try results.json, or whatever naming convention you use
+                 results_file = os.path.join(model_path, "results.json")
+                 if os.path.exists(results_file):
+                     try:
+                         with open(results_file, "r", encoding="utf-8") as f:
+                             data = json.load(f)
+                             # Make sure the dict keys match the AutoEvalColumn names
+                             # (e.g. 'G: Generalization', 'R: Relevance').
+                             # If your raw results lack these keys, generate them in the external
+                             # evaluation run, or compute them here.
+                             # We assume the results file carries these fields directly.
+                             all_results.append(data)
+                     except json.JSONDecodeError as e:
+                         print(f"Failed to parse {results_file}: {e}")
+                     except Exception as e:
+                         print(f"Unexpected error while reading {results_file}: {e}")
+                 else:
+                     print(f"No results.json found in {model_path}.")
+     else:
+         print(f"Evaluation results path does not exist or is not a directory: {eval_results_path}")
+
+     # If there are no real results, fall back to mock data so the GRACE
+     # dimensions can be tested and displayed.
+     if not all_results:
+         print("No evaluation results found; filling the leaderboard with mock data.")
+         all_results = [
+             {
+                 AutoEvalColumn.model.name: "MockModel_A",
+                 AutoEvalColumn.model_type.name: ModelType.LanguageModeling.to_str(),
+                 AutoEvalColumn.precision.name: Precision.float16.value.name,
+                 AutoEvalColumn.params.name: 7.0,
+                 AutoEvalColumn.license.name: "apache-2.0",
+                 AutoEvalColumn.still_on_hub.name: True,
+                 AutoEvalColumn.generalization_score.name: 0.85,
+                 AutoEvalColumn.relevance_score.name: 0.92,
+                 AutoEvalColumn.artistry_score.name: 0.78,
+                 AutoEvalColumn.consistency_score.name: 0.88,
+                 AutoEvalColumn.efficiency_score.name: 0.95,
+                 # ... any other columns you want to show, kept consistent with AutoEvalColumn
+             },
+             {
+                 AutoEvalColumn.model.name: "MockModel_B",
+                 AutoEvalColumn.model_type.name: ModelType.LanguageModeling.to_str(),
+                 AutoEvalColumn.precision.name: Precision.float16.value.name,
+                 AutoEvalColumn.params.name: 13.0,
+                 AutoEvalColumn.license.name: "mit",
+                 AutoEvalColumn.still_on_hub.name: True,
+                 AutoEvalColumn.generalization_score.name: 0.90,
+                 AutoEvalColumn.relevance_score.name: 0.88,
+                 AutoEvalColumn.artistry_score.name: 0.85,
+                 AutoEvalColumn.consistency_score.name: 0.91,
+                 AutoEvalColumn.efficiency_score.name: 0.90,
+             },
+             {
+                 AutoEvalColumn.model.name: "MockModel_C_Image",
+                 AutoEvalColumn.model_type.name: ModelType.ImageGeneration.to_str(),
+                 AutoEvalColumn.precision.name: Precision.bfloat16.value.name,
+                 AutoEvalColumn.params.name: 3.0,
+                 AutoEvalColumn.license.name: "gpl-3.0",
+                 AutoEvalColumn.still_on_hub.name: True,
+                 AutoEvalColumn.generalization_score.name: 0.70,
+                 AutoEvalColumn.relevance_score.name: 0.75,
+                 AutoEvalColumn.artistry_score.name: 0.90,
+                 AutoEvalColumn.consistency_score.name: None,  # an image model may not have a consistency score
+                 AutoEvalColumn.efficiency_score.name: 0.80,
+             }
+         ]
+     # =====================================================================
+
+     if all_results:
+         df = pd.DataFrame(all_results)
+     else:
+         df = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
+
+     # Make sure every expected column exists; fill missing ones with None
+     expected_cols_names = [c.name for c in cols]
+     for col_name in expected_cols_names:
+         if col_name not in df.columns:
+             df[col_name] = None
+
+     # Post-process the DataFrame as needed, e.g. sorting
+     if AutoEvalColumn.generalization_score.name in df.columns and not df[AutoEvalColumn.generalization_score.name].isnull().all():
+         df = df.sort_values(by=AutoEvalColumn.generalization_score.name, ascending=False).reset_index(drop=True)
+
+     return df
+
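+ # A minimal sketch of the results.json layout this loader assumes
+ # (hypothetical values; adjust the keys to your actual column definitions):
+ # {
+ #     "Model": "my-org/my-model",
+ #     "Model type": "LanguageModeling : Language generation model",
+ #     "Precision": "float16",
+ #     "Params (B)": 7.0,
+ #     "License": "apache-2.0",
+ #     "On Hub": true,
+ #     "G: Generalization": 0.85,
+ #     "R: Relevance": 0.92,
+ #     "A: Artistry": 0.78,
+ #     "C: Consistency": 0.88,
+ #     "E: Efficiency": 0.95
+ # }
+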
+ def get_evaluation_queue_df(eval_requests_path: str, eval_cols: list):
+     """
+     Load the evaluation request queue. Defined here in app.py.
+     """
+     pending_requests = []
+     running_requests = []
+     finished_requests = []
+
+     # Example: assume the request files are .jsonl files under eval_requests_path
+     if os.path.exists(eval_requests_path) and os.path.isdir(eval_requests_path):
+         for filename in os.listdir(eval_requests_path):
+             if filename.endswith(".jsonl"):  # or whatever format you store requests in
+                 filepath = os.path.join(eval_requests_path, filename)
+                 try:
+                     with open(filepath, "r", encoding="utf-8") as f:
+                         for line in f:
+                             try:
+                                 request_data = json.loads(line)
+                                 status = request_data.get('status', 'pending')  # assume the request data has a 'status' field
+                                 if status == 'finished':
+                                     finished_requests.append(request_data)
+                                 elif status == 'running':
+                                     running_requests.append(request_data)
+                                 else:  # default and any other status count as pending
+                                     pending_requests.append(request_data)
+                             except json.JSONDecodeError as e:
+                                 print(f"Failed to parse JSONL line: {line.strip()}, error: {e}")
+                 except Exception as e:
+                     print(f"Failed to read {filepath}: {e}")
+     else:
+         print(f"Evaluation requests path does not exist or is not a directory: {eval_requests_path}")
+
+     # Convert the lists to DataFrames, making sure the columns match eval_cols
+     finished_df = pd.DataFrame(finished_requests, columns=eval_cols) if finished_requests else pd.DataFrame(columns=eval_cols)
+     running_df = pd.DataFrame(running_requests, columns=eval_cols) if running_requests else pd.DataFrame(columns=eval_cols)
+     pending_df = pd.DataFrame(pending_requests, columns=eval_cols) if pending_requests else pd.DataFrame(columns=eval_cols)
+
+     return finished_df, running_df, pending_df
+
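+ # One request per line in the assumed .jsonl layout, e.g.
+ # (hypothetical fields; only 'status' is read above, the rest should match EVAL_COLS):
+ # {"Model": "my-org/my-model", "status": "pending", "Precision": "float16"}
+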
+ # =====================================================================
+ # **End of key changes: GRACE-related classes and functions defined directly in app.py**
+ # =====================================================================
+
+
+ # Keep using API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN from src.envs.
+ # We assume these constants can be loaded somehow, or are configured in the Space settings.
+ # If src.envs also cannot be modified and these values cannot be set via environment variables, that could be a problem.
+ # In a Hugging Face Space they are usually loaded from environment variables or Space secrets.
+ # They are not redefined here; we assume they are available.
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.submission.submit import add_new_eval  # add_new_eval is assumed to come from src as well
 
267
  def restart_space():
268
  API.restart_space(repo_id=REPO_ID)
269
 
270
  ### Space initialisation
271
  try:
272
+ print(f"下载评估请求到: {EVAL_REQUESTS_PATH}")
273
  snapshot_download(
274
  repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
275
  )
276
+ except Exception as e:
277
+ print(f"下载评估请求失败: {e}")
278
  restart_space()
279
  try:
280
+ print(f"下载评估结果到: {EVAL_RESULTS_PATH}")
281
  snapshot_download(
282
  repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
283
  )
284
+ except Exception as e:
285
+ print(f"下载评估结果失败: {e}")
286
  restart_space()
287
 
288
 
289
+ # 现在,这些函数调用将使用我们刚刚在 app.py 中定义的版本
290
  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
291
 
292
  (
 

  def init_leaderboard(dataframe):
      if dataframe is None or dataframe.empty:
+         print("Leaderboard DataFrame is empty or None; initialising an empty leaderboard.")
+         return Leaderboard(
+             value=pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)]),  # empty DataFrame that still carries the column names
+             datatype=[c.type for c in fields(AutoEvalColumn)],
+             select_columns=SelectColumns(
+                 default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+                 cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+                 label="Select Columns to Display:",
+             ),
+             search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+             hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+             filter_columns=[],  # no concrete filters on an empty DataFrame, to avoid errors
+             bool_checkboxgroup_label="Hide models",
+             interactive=False,
+         )
+
      return Leaderboard(
          value=dataframe,
          datatype=[c.type for c in fields(AutoEvalColumn)],
          select_columns=SelectColumns(
              default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
              cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+             label="Select Columns to Display:",
          ),
          search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
          hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
          filter_columns=[
+             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
              ColumnFilter(
                  AutoEvalColumn.params.name,
                  type="slider",
                  min=0.01,
                  max=150,
+                 label="Select the number of parameters (B)",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
              ),
+             # Filters (sliders) for the GRACE scores,
+             # assuming the scores fall between 0.0 and 1.0
              ColumnFilter(
+                 AutoEvalColumn.generalization_score.name,
+                 type="slider",
+                 min=0.0,
+                 max=1.0,
+                 label="G: Generalization score",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.relevance_score.name,
+                 type="slider",
+                 min=0.0,
+                 max=1.0,
+                 label="R: Relevance score",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.artistry_score.name,
+                 type="slider",
+                 min=0.0,
+                 max=1.0,
+                 label="A: Artistry score",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.consistency_score.name,
+                 type="slider",
+                 min=0.0,
+                 max=1.0,
+                 label="C: Consistency score",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.efficiency_score.name,
+                 type="slider",
+                 min=0.0,
+                 max=1.0,
+                 label="E: Efficiency score",
+             ),
          ],
+         bool_checkboxgroup_label="Hide models",
          interactive=False,
      )

          with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
              leaderboard = init_leaderboard(LEADERBOARD_DF)

+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
              gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

+         with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
              with gr.Column():
                  with gr.Row():
                      gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                  with gr.Column():
                      with gr.Accordion(
+                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                          open=False,
                      ):
                          with gr.Row():

                              headers=EVAL_COLS,
                              datatype=EVAL_TYPES,
                              row_count=5,
+                             label="Finished evaluation queue",
                          )
                      with gr.Accordion(
+                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                          open=False,
                      ):
                          with gr.Row():

                              headers=EVAL_COLS,
                              datatype=EVAL_TYPES,
                              row_count=5,
+                             label="Running evaluation queue",
                          )

                      with gr.Accordion(
+                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                          open=False,
                      ):
                          with gr.Row():

                              headers=EVAL_COLS,
                              datatype=EVAL_TYPES,
                              row_count=5,
+                             label="Pending evaluation queue",
                          )
              with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

              with gr.Row():
                  with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     # Set a default model type to reflect the project focus (e.g. language generation models)
                      model_type = gr.Dropdown(
                          choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
                          multiselect=False,
+                         value=ModelType.LanguageModeling.to_str(" : "),  # example: focus on language generation models
                          interactive=True,
                      )

                  with gr.Column():
                      precision = gr.Dropdown(
                          choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
                          multiselect=False,
                          value="float16",
                          interactive=True,
                      )
                      weight_type = gr.Dropdown(
                          choices=[i.value.name for i in WeightType],
+                         label="Weights type",
                          multiselect=False,
                          value="Original",
                          interactive=True,
                      )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

+                     submit_button = gr.Button("Submit Eval")
                      submission_result = gr.Markdown()
                      submit_button.click(
                          add_new_eval,

                      )

              with gr.Row():
+                 with gr.Accordion("📙 Citation", open=False):
                      citation_button = gr.Textbox(
                          value=CITATION_BUTTON_TEXT,
                          label=CITATION_BUTTON_LABEL,

          )

      scheduler = BackgroundScheduler()
+     # Restart the Space every 30 minutes so the data stays fresh
+     scheduler.add_job(restart_space, "interval", seconds=1800)
      scheduler.start()
      demo.queue(default_concurrency_limit=40).launch()