czyPL committed
Commit 20fb95e · 1 Parent(s): 1e82f18

change language

Files changed (1): app.py +172 -167
app.py CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-LongBenchmark 结果可视化
+LongBenchmark Results Visualization
 """
 
 import json
@@ -15,21 +15,21 @@ with open('./results/model_info.json', 'r', encoding='utf-8') as f:
     MODLE_INFO_DICT = json.load(f)
 
 def get_color(index):
-    """基于索引生成颜色,使用黄金角度确保颜色分布均匀且无限"""
-    # 黄金角度约 137.508 度,确保颜色在色环上分布均匀
+    """Generate color based on index, using golden angle to ensure uniform and infinite color distribution"""
+    # Golden angle approx 137.508 degrees
     hue = (index * 137.508) % 360
-    # 固定饱和度为70%,亮度为60%,确保颜色既鲜艳又不刺眼
+    # Fixed saturation 70%, lightness 60%
    return f"hsl({hue}, 70%, 60%)"
 
-# 自定义CSS
+# Custom CSS
 CUSTOM_CSS = """
-/* 强制标题居中 */
+/* Force title center */
 h1 {
     text-align: center;
     display: block;
 }
 
-/* 表头居中 */
+/* Header center */
 #leaderboard_table th,
 #leaderboard_table th button,
 #leaderboard_table th span {
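The golden-angle trick kept in `get_color` above spaces successive hues roughly 137.5° apart around the color wheel, so any number of series get visually distinct colors without a fixed palette. A minimal standalone sketch of the same idea (not part of the commit):

```python
# Sketch of golden-angle color assignment, mirroring get_color above.
def golden_angle_color(index: int) -> str:
    # Successive indices advance the hue by ~137.508 degrees (the golden angle),
    # so consecutive series land on well-separated hues.
    hue = (index * 137.508) % 360
    return f"hsl({hue}, 70%, 60%)"

# Hues for indices 0..3 are roughly 0, 137.5, 275.0, 52.5:
print([golden_angle_color(i) for i in range(4)])
```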
@@ -37,10 +37,15 @@ h1 {
     justify-content: center !important;
 }
 
-/* 内容列居中:从第3列开始(跳过行号和模型名称) */
+/* Content column center: starting from 3rd column */
 #leaderboard_table td:nth-child(n+3) {
     text-align: center !important;
 }
+
+/* Make tab labels bold */
+button[role="tab"] {
+    font-weight: bold !important;
+}
 """
 
 class ResultParser:
@@ -49,30 +54,30 @@ class ResultParser:
         self.results = []
 
     def parse_filename(self, filename: str):
-        """解析文件名,提取context长度和是否包含thinking或nonthinking"""
-        # 提取context长度
+        """Parse filename to extract context length and thinking status"""
+        # Extract context length
         context_match = re.search(r'context-(\d+)', filename)
         context_length = int(context_match.group(1)) if context_match else 0
 
         filename_lower = filename.lower()
-        # 检查是否包含nonthinking(优先检查,因为nonthinking也包含thinking)
+        # Check nonthinking
         has_nonthinking = 'nonthinking' in filename_lower
-        # 检查是否包含thinking(但不包含nonthinking)
+        # Check thinking
         has_thinking = 'thinking' in filename_lower and not has_nonthinking
 
         return context_length, has_thinking, has_nonthinking
 
     def parse_result_file(self, model_name: str, file_path: Path):
-        """解析单个结果文件"""
+        """Parse single result file"""
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
 
             context_length, has_thinking, has_nonthinking = self.parse_filename(file_path.name)
-            # 使用JSON文件中的date字段作为评估日期
-            eval_date = data.get('date', "未知")
+            # Use date field as evaluation date
+            eval_date = data.get('date', "Unknown")
 
-            # 提取BoN数据
+            # Extract BoN data
             bon_data = {}
             for bon_key in ['BoN-1', 'BoN-2', 'BoN-3']:
                 if bon_key in data and 'overall_metric' in data[bon_key]:
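For reference, the filename convention parsed above encodes a `context-<N>` token for the truncation length plus an optional `thinking`/`nonthinking` marker; `nonthinking` must be checked first because it contains `thinking` as a substring. A quick demo of the same logic on a hypothetical filename (the name below is illustrative, not an actual results file):

```python
import re

def parse_filename(filename: str):
    # Same logic as ResultParser.parse_filename in the hunk above.
    match = re.search(r'context-(\d+)', filename)
    context_length = int(match.group(1)) if match else 0
    lower = filename.lower()
    has_nonthinking = 'nonthinking' in lower  # checked first: contains 'thinking'
    has_thinking = 'thinking' in lower and not has_nonthinking
    return context_length, has_thinking, has_nonthinking

print(parse_filename("context-128000_nonthinking_summary.json"))
# -> (128000, False, True)
```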
@@ -90,7 +95,7 @@ class ResultParser:
                 'difficulty': data.get('average_difficulty_metric', {}),
                 'primary_task': data.get('average_primary_task_metric', {}),
                 'language': data.get('average_language_metric', {}),
-                'bon_data': bon_data,  # 存储BoN-1, BoN-2, BoN-3的overall_metric
+                'bon_data': bon_data,  # Store BoN-1, BoN-2, BoN-3 overall_metric
                 'pass_at_k': {
                     'pass@1': data.get('pass@1'),
                     'pass@2': data.get('pass@2'),
@@ -101,40 +106,40 @@ class ResultParser:
             return result
 
         except Exception as e:
-            print(f"解析文件 {file_path} 时出错: {e}")
+            print(f"Error parsing file {file_path}: {e}")
             return None
 
     def scan_all_results(self):
-        """扫描所有模型的结果文件"""
+        """Scan all model result files"""
         self.results = []
 
         if not self.output_dir.exists():
-            print(f"输出目录不存在: {self.output_dir}")
+            print(f"Output directory does not exist: {self.output_dir}")
             return
 
-        # 遍历所有模型目录
+        # Traverse all model directories
         for model_dir in self.output_dir.iterdir():
             if not model_dir.is_dir():
                 continue
 
             model_name = model_dir.name
-            print(f"扫描模型: {model_name}")
+            print(f"Scanning model: {model_name}")
 
-            # 查找该模型下的所有_summary.json文件
+            # Find all _summary.json files
             for file_path in model_dir.glob("*_summary.json"):
-                print(f" 解析文件: {file_path.name}")
+                print(f" Parsing file: {file_path.name}")
                 result = self.parse_result_file(model_name, file_path)
                 if result:
                     self.results.append(result)
 
-        print(f"总共解析了 {len(self.results)} 个结果文件")
+        print(f"Total parsed {len(self.results)} result files")
 
     def get_leaderboard_data(self):
-        """获取排行榜数据"""
+        """Get leaderboard data"""
         if not self.results:
             return pd.DataFrame()
 
-        # 按模型名称聚合数据
+        # Aggregate data by model name
         model_groups = {}
         for result in self.results:
             model_name = result['model_name']
@@ -158,14 +163,14 @@ class ResultParser:
 
         leaderboard_data = []
         for model_name, group in model_groups.items():
-            # 获取最新日期
-            valid_dates = [d for d in group['dates'] if d != "未知"]
-            latest_date = max(valid_dates) if valid_dates else "未知"
+            # Get latest date
+            valid_dates = [d for d in group['dates'] if d != "Unknown"]
+            latest_date = max(valid_dates) if valid_dates else "Unknown"
 
-            # 获取最大Context Window
+            # Get max Context Window
             max_context = max(group['contexts']) if group['contexts'] else 0
 
-            # 格式化截断长度
+            # Format truncated length
             if max_context >= 1000000:
                 context_str = f"{max_context/1000000:.0f}M" if max_context % 1000000 == 0 else f"{max_context/1000000:.1f}M"
             elif max_context >= 1000:
@@ -173,7 +178,7 @@ class ResultParser:
             else:
                 context_str = str(max_context)
 
-            # 获取模型类型和上下文长度
+            # Get model type and context length
             model_context = "-"
             model_url = ""
             if model_name in MODLE_INFO_DICT:
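The truncation-length formatting in the two hunks above compresses raw token counts into compact labels (for example 128000 → "128k", 1000000 → "1M"). The thousands branch falls outside the lines shown, so the sketch below assumes it mirrors the millions branch:

```python
def format_context(n: int) -> str:
    # Assumed reconstruction: the 'k' branch is presumed to parallel the 'M'
    # branch above, since that line is outside the displayed hunk.
    if n >= 1000000:
        return f"{n/1000000:.0f}M" if n % 1000000 == 0 else f"{n/1000000:.1f}M"
    elif n >= 1000:
        return f"{n/1000:.0f}k" if n % 1000 == 0 else f"{n/1000:.1f}k"
    return str(n)

print(format_context(128000), format_context(1000000), format_context(512))
# -> 128k 1M 512
```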
@@ -187,13 +192,13 @@ class ResultParser:
             else:
                 model_type = "Unknown"
 
-            # 处理模型名称链接和图标
+            # Handle model name link and icon
             display_model_name = model_name
 
             if model_url:
                 display_model_name = f"[{display_model_name}]({model_url})"
 
-            # 计算平均分
+            # Calculate average score
             nt_score_val = 0
             nt_score_str = "-"
             if group['non_thinking_scores']:
@@ -207,24 +212,24 @@ class ResultParser:
                 t_score_str = f"{t_score_val * 100:.2f}"
 
             leaderboard_data.append({
-                '模型名称': display_model_name,
-                '模型类型': model_type,
-                '上下文长度': model_context,
-                '截断长度': context_str,
-                '非思考得分': nt_score_str,
-                '思考得分': t_score_str,
+                'Model Name': display_model_name,
+                'Model Type': model_type,
+                'Context Length': model_context,
+                'Truncated Length': context_str,
+                'Non-Thinking Score': nt_score_str,
+                'Thinking Score': t_score_str,
                 '_sort_score': max(nt_score_val, t_score_val)
             })
 
         df = pd.DataFrame(leaderboard_data)
-        # 按最高分降序排列
+        # Sort by highest score descending
         if not df.empty:
             df = df.sort_values('_sort_score', ascending=False).drop(columns=['_sort_score']).reset_index(drop=True)
 
         return df
 
 def get_display_name_for_result(result):
-    """获取模型的显示名称(根据是否包含thinking或nonthinking添加后缀)"""
+    """Get display name for model (append suffix based on thinking/nonthinking)"""
     if result.get('has_nonthinking'):
         return f"{result['model_name']}_nonthinking"
     elif result.get('has_thinking'):
@@ -233,18 +238,18 @@ def get_display_name_for_result(result):
         return result['model_name']
 
 def get_model_color_index(model_name, all_models):
-    """获取模型在颜色列表中的索引"""
+    """Get model index in color list"""
     try:
         return all_models.index(model_name)
     except ValueError:
         return 0
 
 def create_contextual_requirement_chart(results, selected_models):
-    """创建上下文需求对比柱状图"""
+    """Create contextual requirement comparison bar chart"""
     if not selected_models:
         return go.Figure()
 
-    # 收集数据 - 直接使用summary中的值,不需要计算平均值
+    # Collect data
    chart_data = {}
 
    for result in results:
@@ -253,17 +258,17 @@ def create_contextual_requirement_chart(results, selected_models):
            model_name = display_name
            contextual_requirement = result['contextual_requirement']
 
-            # 直接存储每个模型的结果,不需要计算平均值
+            # Store each model's result directly
            if model_name not in chart_data:
                chart_data[model_name] = {}
 
            for req_type, score in contextual_requirement.items():
-                chart_data[model_name][req_type] = score * 100  # 乘以100
+                chart_data[model_name][req_type] = score * 100  # multiply by 100
 
-    # 创建图表
+    # Create chart
    fig = go.Figure()
 
-    # 获取所有需求类型 - 保持原始顺序,不排序
+    # Get all requirement types
    all_req_types = []
    for result in results:
        display_name = get_display_name_for_result(result)
@@ -283,36 +288,36 @@ def create_contextual_requirement_chart(results, selected_models):
            x=all_req_types,
            y=scores,
            marker_color=get_color(color_index),
-            text=[f"{score:.2f}" for score in scores],  # 保留2位小数
+            text=[f"{score:.2f}" for score in scores],  # keep 2 decimal places
            textposition='auto'
        ))
 
    fig.update_layout(
-        title='模型在不同上下文需求上的性能对比',
-        xaxis_title='上下文需求类型',
-        yaxis_title='平均得分',
+        title='Performance Comparison on Different Context Requirements',
+        xaxis_title='Context Requirement Type',
+        yaxis_title='Average Score',
        barmode='group',
-        autosize=True,  # 自动调整大小
+        autosize=True,  # auto size
        legend=dict(
            orientation="h",
            yanchor="top",
-            y=-0.25,  # 调整到更下方
+            y=-0.25,  # adjust lower
            xanchor="center",
            x=0.5
        ),
-        margin=dict(b=100)  # 增加底部边距
+        margin=dict(b=100)  # increase bottom margin
    )
 
    return fig
 
 def create_primary_task_radar_chart(results, selected_models):
-    """创建主要任务雷达图(按任务前缀聚合,使用'.'前缀,绘制最多11个任务)"""
+    """Create primary task radar chart (aggregate by prefix)"""
    if not selected_models:
        return go.Figure()
 
-    # 收集所有模型下的任务前缀,保持出现顺序
+    # Collect all model task prefixes
    prefix_order = []
-    # 为每个模型构建 前缀 -> [scores] 的映射
+    # Map prefix -> [scores] for each model
    model_prefix_scores = {}
 
    for result in results:
@@ -330,16 +335,16 @@ def create_primary_task_radar_chart(results, selected_models):
                model_prefix_scores[display_name][prefix] = []
            model_prefix_scores[display_name][prefix].append(score * 100)
 
-    # 只取前11个前缀用于绘制
+    # Take first 11 prefixes
    categories = prefix_order[:11]
 
-    # 创建雷达图
+    # Create radar chart
    fig = go.Figure()
 
    for model_name in selected_models:
        if model_name not in model_prefix_scores:
            continue
-        # 对每个前缀做均值聚合;缺失则为0
+        # Mean aggregation for each prefix
        values = []
        for prefix in categories:
            scores = model_prefix_scores[model_name].get(prefix, [])
@@ -347,7 +352,7 @@ def create_primary_task_radar_chart(results, selected_models):
                values.append(sum(scores) / len(scores))
            else:
                values.append(0)
-        # 闭合多边形
+        # Close polygon
        r_values = values + ([values[0]] if values else [])
        theta_values = categories + ([categories[0]] if categories else [])
        color_index = get_model_color_index(model_name, selected_models)
@@ -362,7 +367,7 @@ def create_primary_task_radar_chart(results, selected_models):
        ))
 
    fig.update_layout(
-        title='模型在不同主要任务上的性能对比',
+        title='Performance Comparison on Different Primary Tasks',
        polar=dict(
            radialaxis=dict(visible=True, range=[0, 100])
        ),
@@ -379,11 +384,11 @@ def create_primary_task_radar_chart(results, selected_models):
    return fig
 
 def create_language_chart(results, selected_models):
-    """创建语言对比柱状图"""
+    """Create language comparison bar chart"""
    if not selected_models:
        return go.Figure()
 
-    # 收集数据 - 直接使用summary中的值,不需要计算平均值
+    # Collect data
    chart_data = {}
 
    for result in results:
@@ -392,17 +397,17 @@ def create_language_chart(results, selected_models):
            model_name = display_name
            language = result['language']
 
-            # 直接存储每个模型的结果,不需要计算平均值
+            # Store each model's result directly
            if model_name not in chart_data:
                chart_data[model_name] = {}
 
            for lang_type, score in language.items():
-                chart_data[model_name][lang_type] = score * 100  # 乘以100
+                chart_data[model_name][lang_type] = score * 100  # multiply by 100
 
-    # 创建图表
+    # Create chart
    fig = go.Figure()
 
-    # 获取所有语言类型 - 保持原始顺序,不排序
+    # Get all language types
    all_lang_types = []
    for result in results:
        display_name = get_display_name_for_result(result)
@@ -422,34 +427,34 @@ def create_language_chart(results, selected_models):
            x=all_lang_types,
            y=scores,
            marker_color=get_color(color_index),
-            text=[f"{score:.2f}" for score in scores],  # 保留2位小数
+            text=[f"{score:.2f}" for score in scores],  # keep 2 decimal places
            textposition='auto'
        ))
 
    fig.update_layout(
-        title='模型在不同语言上的性能对比',
-        xaxis_title='语言类型',
-        yaxis_title='平均得分',
+        title='Performance Comparison on Different Languages',
+        xaxis_title='Language Type',
+        yaxis_title='Average Score',
        barmode='group',
-        autosize=True,  # 自动调整大小
+        autosize=True,  # auto size
        legend=dict(
            orientation="h",
            yanchor="top",
-            y=-0.25,  # 调整到更下方
+            y=-0.25,  # adjust lower
            xanchor="center",
            x=0.5
        ),
-        margin=dict(b=100)  # 增加底部边距
+        margin=dict(b=100)  # increase bottom margin
    )
 
    return fig
 
 def create_difficulty_chart(results, selected_models):
-    """创建难度对比柱状图"""
+    """Create difficulty comparison bar chart"""
    if not selected_models:
        return go.Figure()
 
-    # 收集数据 - 直接使用summary中的值,不需要计算平均值
+    # Collect data
    chart_data = {}
 
    for result in results:
@@ -458,17 +463,17 @@ def create_difficulty_chart(results, selected_models):
            model_name = display_name
            difficulty = result['difficulty']
 
-            # 直接存储每个模型的结果,不需要计算平均值
+            # Store each model's result directly
            if model_name not in chart_data:
                chart_data[model_name] = {}
 
            for diff_type, score in difficulty.items():
-                chart_data[model_name][diff_type] = score * 100  # 乘以100
+                chart_data[model_name][diff_type] = score * 100  # multiply by 100
 
-    # 创建图表
+    # Create chart
    fig = go.Figure()
 
-    # 获取所有难度类型 - 保持原始顺序,不排序
+    # Get all difficulty types
    all_diff_types = []
    for result in results:
        display_name = get_display_name_for_result(result)
@@ -488,38 +493,38 @@ def create_difficulty_chart(results, selected_models):
            x=all_diff_types,
            y=scores,
            marker_color=get_color(color_index),
-            text=[f"{score:.2f}" for score in scores],  # 保留2位小数
+            text=[f"{score:.2f}" for score in scores],  # keep 2 decimal places
            textposition='auto'
        ))
 
    fig.update_layout(
-        title='模型在不同难度上的性能对比',
-        xaxis_title='难度类型',
-        yaxis_title='平均得分',
+        title='Performance Comparison on Different Difficulties',
+        xaxis_title='Difficulty Type',
+        yaxis_title='Average Score',
        barmode='group',
-        autosize=True,  # 自动调整大小
+        autosize=True,  # auto size
        legend=dict(
            orientation="h",
            yanchor="top",
-            y=-0.25,  # 调整到更下方
+            y=-0.25,  # adjust lower
            xanchor="center",
            x=0.5
        ),
-        margin=dict(b=100)  # 增加底部边距
+        margin=dict(b=100)  # increase bottom margin
    )
 
    return fig
 
 def create_length_heatmap(results, selected_models):
-    """创建长度热力图:横坐标为长度,纵坐标为模型"""
+    """Create length heatmap"""
    if not selected_models:
        return go.Figure()
 
-    # 定义标准的context长度范围:8k, 16k, 32k, 64k, 128k, 256k
+    # Standard context lengths
    standard_lengths = [8000, 16000, 32000, 64000, 128000, 256000]
    standard_length_keys = ['8k', '16k', '32k', '64k', '128k', '256k']
 
-    # 准备热力图数据
+    # Prepare heatmap data
    heatmap_data = []
    model_names = []
 
@@ -528,52 +533,52 @@ def create_length_heatmap(results, selected_models):
        if display_name in selected_models:
            model_names.append(display_name)
 
-            # 从token_length_metrics中获取数据
+            # Get data from token_length_metrics
            token_length_metrics = result.get('token_length_metrics', {})
            row_data = []
 
            for key in standard_length_keys:
                if key in token_length_metrics:
-                    row_data.append(token_length_metrics[key] * 100)  # 乘以100转换为百分比
+                    row_data.append(token_length_metrics[key] * 100)  # multiply by 100
                else:
-                    row_data.append(None)  # 没有数据点
+                    row_data.append(None)  # No data point
 
            heatmap_data.append(row_data)
 
-    # 创建热力图
+    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=heatmap_data,
-        x=[f"{length//1000}k" for length in standard_lengths],  # x轴标签
-        y=model_names,  # y轴标签
-        colorscale='RdYlBu_r',  # 颜色映射:红色表示低分,蓝色表示高分
+        x=[f"{length//1000}k" for length in standard_lengths],  # x axis labels
+        y=model_names,  # y axis labels
+        colorscale='RdYlBu_r',  # Red is low, Blue is high
        showscale=True,
-        text=[[f"{val:.2f}" if val is not None else "N/A" for val in row] for row in heatmap_data],  # 显示数值
+        text=[[f"{val:.2f}" if val is not None else "N/A" for val in row] for row in heatmap_data],  # show values
        texttemplate="%{text}",
        textfont={"size": 10},
        hoverongaps=False
    ))
 
    fig.update_layout(
-        title='模型在不同Context长度上的性能热力图',
-        xaxis_title='Context长度 (tokens)',
-        yaxis_title='模型名称',
+        title='Performance Heatmap on Different Sample Lengths',
+        xaxis_title='Sample Length (tokens)',
+        yaxis_title='Model Name',
        autosize=True,
-        height=max(400, len(model_names) * 50),  # 根据模型数量调整高度
-        margin=dict(l=150, r=50, t=80, b=80)  # 调整边距,左侧留更多空间给模型名称
+        height=max(400, len(model_names) * 50),  # adjust height based on model count
+        margin=dict(l=150, r=50, t=80, b=80)  # adjust margins
    )
 
    return fig
 
 def create_bon_chart(results, selected_models):
-    """创建BoN 1-3折线图,显示overall_metric"""
+    """Create BoN 1-3 line chart"""
    if not selected_models:
        return go.Figure()
 
-    # BoN 标签
+    # BoN labels
    bon_labels = ['BoN-1', 'BoN-2', 'BoN-3']
    bon_indices = [1, 2, 3]
 
-    # 为每个模型准备数据
+    # Prepare data for each model
    model_data = {}
    for result in results:
        display_name = get_display_name_for_result(result)
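In the heatmap above, lengths absent from `token_length_metrics` become `None` cells; `go.Heatmap` renders those as gaps, and `hoverongaps=False` suppresses tooltips over them. A self-contained sketch with made-up scores (model names and values are placeholders):

```python
import plotly.graph_objects as go

# Hypothetical percent scores for two models; None marks lengths with no data.
z = [[72.5, 68.0, None, 55.1],
     [80.2, None, 61.3, 49.8]]

fig = go.Figure(go.Heatmap(
    z=z,
    x=['8k', '16k', '32k', '64k'],
    y=['model-a', 'model-b'],
    colorscale='RdYlBu_r',
    text=[[f"{v:.2f}" if v is not None else "N/A" for v in row] for row in z],
    texttemplate="%{text}",
    hoverongaps=False,  # no hover tooltip over the None cells
))
fig.show()
```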
@@ -581,21 +586,21 @@ def create_bon_chart(results, selected_models):
        if display_name not in model_data:
            model_data[display_name] = {}
 
-        # 从bon_data中获取数据
+        # Get data from bon_data
        bon_data = result.get('bon_data', {})
        for bon_key in bon_labels:
            if bon_key in bon_data:
                bon_index = bon_labels.index(bon_key) + 1
-                model_data[display_name][bon_index] = bon_data[bon_key] * 100  # 乘以100转换为百分比
+                model_data[display_name][bon_index] = bon_data[bon_key] * 100  # multiply by 100
 
-    # 创建图表
+    # Create chart
    fig = go.Figure()
 
    for model_name, data in model_data.items():
        if not data:
            continue
 
-        # 为每个BoN准备数据
+        # Prepare data for each BoN
        x_values = []
        y_values = []
        text_values = []
@@ -609,7 +614,7 @@ def create_bon_chart(results, selected_models):
                y_values.append(None)
                text_values.append("")
 
-        # 获取模型颜色索引
+        # Get model color index
        color_index = get_model_color_index(model_name, selected_models)
 
        fig.add_trace(go.Scatter(
@@ -624,11 +629,11 @@ def create_bon_chart(results, selected_models):
            connectgaps=False
        ))
 
-    # 设置x轴
+    # Set x axis
    fig.update_layout(
-        title='模型在不同Best-of-N下的对比',
+        title='Performance Comparison on Different Best-of-N',
        xaxis_title='N',
-        yaxis_title='平均得分',
+        yaxis_title='Average Score',
        autosize=True,
        xaxis=dict(
            tickmode='array',
@@ -649,15 +654,15 @@ def create_bon_chart(results, selected_models):
    return fig
 
 def create_pass_k_chart(results, selected_models):
-    """创建Pass@N 折线图"""
+    """Create Pass@N line chart"""
    if not selected_models:
        return go.Figure()
 
-    # Pass@K 标签
-    k_labels = ['pass@1', 'pass@2', 'pass@3']
+    # Pass@K labels
+    k_labels = ['Pass@1', 'Pass@2', 'Pass@3']
    k_indices = [1, 2, 3]
 
-    # 为每个模型准备数据
+    # Prepare data for each model
    model_data = {}
    for result in results:
        display_name = get_display_name_for_result(result)
@@ -665,22 +670,22 @@ def create_pass_k_chart(results, selected_models):
        if display_name not in model_data:
            model_data[display_name] = {}
 
-        # 从pass_at_k中获取数据
+        # Get data from pass_at_k
        pass_data = result.get('pass_at_k', {})
        for i, k_key in enumerate(k_labels):
            val = pass_data.get(k_key)
            if val is not None:
                k_index = k_indices[i]
-                model_data[display_name][k_index] = val * 100  # 乘以100转换为百分比
+                model_data[display_name][k_index] = val * 100  # multiply by 100
 
-    # 创建图表
+    # Create chart
    fig = go.Figure()
 
    for model_name, data in model_data.items():
        if not data:
            continue
 
-        # 为每个Pass@K准备数据
+        # Prepare data for each Pass@K
        x_values = []
        y_values = []
        text_values = []
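One detail worth flagging in these hunks: the labels were changed to `'Pass@1'`/`'Pass@2'`/`'Pass@3'`, but `parse_result_file` above stores the `pass_at_k` dict under lowercase keys (`'pass@1'`, ...), so the unchanged `pass_data.get(k_key)` lookup now depends on matching case. A small sketch of a case-insensitive lookup that would tolerate either spelling (a suggestion, not part of the commit):

```python
def get_pass_metric(pass_data: dict, label: str):
    # Normalize keys so 'Pass@1' and 'pass@1' resolve to the same entry.
    normalized = {k.lower(): v for k, v in pass_data.items()}
    return normalized.get(label.lower())

pass_data = {'pass@1': 0.41, 'pass@2': 0.52, 'pass@3': 0.58}  # hypothetical values
print(get_pass_metric(pass_data, 'Pass@2'))  # -> 0.52
```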
@@ -694,7 +699,7 @@ def create_pass_k_chart(results, selected_models):
                y_values.append(None)
                text_values.append("")
 
-        # 获取模型颜色索引
+        # Get model color index
        color_index = get_model_color_index(model_name, selected_models)
 
        fig.add_trace(go.Scatter(
@@ -709,9 +714,9 @@ def create_pass_k_chart(results, selected_models):
            connectgaps=False
        ))
 
-    # 设置x轴
+    # Set x axis
    fig.update_layout(
-        title='模型在不同Pass@N下的对比',
+        title='Performance Comparison on Different Pass@N',
        xaxis_title='N',
        yaxis_title='Pass@N (%)',
        autosize=True,
@@ -734,15 +739,15 @@ def create_pass_k_chart(results, selected_models):
    return fig
 
 def create_gradio_interface(parser: ResultParser):
-    """创建Gradio界面"""
+    """Create Gradio interface"""
 
    def refresh_data():
-        """刷新数据"""
+        """Refresh data"""
        parser.scan_all_results()
        return parser.get_leaderboard_data()
 
    def get_model_choices():
-        """获取模型选择列表(按是否包含Thinking或NonThinking区分,以相应后缀标识)"""
+        """Get model choices (distinguish by suffix for thinking/nonthinking)"""
        if not parser.results:
            return []
        display_names = set()
@@ -753,7 +758,7 @@ def create_gradio_interface(parser: ResultParser):
        return models
 
    def update_charts(selected_models):
-        """更新所有图表"""
+        """Update all charts"""
        if not selected_models:
            return None, None, None, None, None, None, None
 
@@ -767,9 +772,9 @@ def create_gradio_interface(parser: ResultParser):
 
        return length_heatmap, contextual_chart, primary_task_radar_chart, language_chart, difficulty_chart, bon_chart, pass_k_chart
 
-    # 创建界面
-    with gr.Blocks(title="LongBench Pro 结果可视化", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
-        gr.Markdown("# LongBench Pro 结果可视化")
+    # Create interface
+    with gr.Blocks(title="LongBench Pro Results Visualization", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
+        gr.Markdown("# LongBench Pro Results Visualization")
 
        gr.HTML("""
        <div style="text-align: center; display: flex; justify-content: center; gap: 10px; margin-bottom: 20px;">
@@ -780,14 +785,14 @@ def create_gradio_interface(parser: ResultParser):
        </div>
        """)
 
-        # 排行榜区域
-        gr.Markdown("## 🏆 总体性能排行榜")
+        # Leaderboard area
+        gr.Markdown("## 🏆 Overall Performance Leaderboard")
        gr.Markdown("""
-        - *思考模型和混合思考模型的思考得分,使用本身的思考能力(Non-Thinking Prompt)*
-        - *指令模型的思考得分,使用思考提示获得(Thinking Prompt)*
+        - *Thinking scores for Thinking and Mixed-Thinking models use their own thinking capabilities (Non-Thinking Prompt)*
+        - *Thinking scores for Instruct models are obtained using thinking prompts (Thinking Prompt)*
        """)
        leaderboard_df = gr.Dataframe(
-            headers=["模型名称", "模型类型", "上下文长度", "截断长度", "非思考得分", "思考得分"],
+            headers=["Model Name", "Model Type", "Context Length", "Truncation Length", "Non-Thinking Score", "Thinking Score"],
            datatype=["markdown", "str", "str", "str", "str", "str"],
            interactive=False,
            wrap=True,
@@ -798,47 +803,47 @@ def create_gradio_interface(parser: ResultParser):
            elem_id="leaderboard_table"
        )
 
-        # 模型筛选和图表区域
+        # Model selection and chart area
        gr.HTML("<br>")
-        gr.Markdown("## 📊 特定维度对比")
+        gr.Markdown("## 📊 Specific Dimension Comparison")
        with gr.Row():
            with gr.Column(scale=4):
                model_selector = gr.Dropdown(
                    choices=[],
-                    label="选择模型",
+                    label="Select Models",
                    value=[],
                    multiselect=True,
                    interactive=True
                )
            with gr.Column(scale=1):
-                update_charts_btn = gr.Button("更新图表", variant="primary", size="lg")
+                update_charts_btn = gr.Button("Update Charts", variant="primary", size="lg")
 
        with gr.Tabs():
-            with gr.TabItem("语言维度"):
-                language_plot = gr.Plot()
+            with gr.TabItem("Language"):
+                language_plot = gr.Plot(show_label=False)
 
-            with gr.TabItem("难度维度"):
-                difficulty_plot = gr.Plot()
+            with gr.TabItem("Difficulty"):
+                difficulty_plot = gr.Plot(show_label=False)
 
-            with gr.TabItem("长度维度"):
-                length_heatmap = gr.Plot()
+            with gr.TabItem("Sample Length"):
+                length_heatmap = gr.Plot(show_label=False)
 
-            with gr.TabItem("主要任务维度"):
-                primary_task_radar_plot = gr.Plot()
+            with gr.TabItem("Primary Task"):
+                primary_task_radar_plot = gr.Plot(show_label=False)
 
-            with gr.TabItem("上下文需求维度"):
-                contextual_plot = gr.Plot()
+            with gr.TabItem("Context Requirement"):
+                contextual_plot = gr.Plot(show_label=False)
 
-            with gr.TabItem("BoN维度"):
-                bon_plot = gr.Plot()
+            with gr.TabItem("Best-of-N"):
+                bon_plot = gr.Plot(show_label=False)
 
-            with gr.TabItem("Pass@N维度"):
-                pass_k_plot = gr.Plot()
+            with gr.TabItem("Pass@N"):
+                pass_k_plot = gr.Plot(show_label=False)
 
-        # 增加底部空白区域,确保下拉框有足够空间向下展开,而不是因为底部空间不足而向上展开
+        # Add bottom spacer
        gr.HTML("<div style='height: 100px;'></div>")
 
-        # 事件处理
+        # Event handling
        def update_model_choices():
            models = get_model_choices()
            return gr.Dropdown(choices=models, value=[])
@@ -849,7 +854,7 @@ def create_gradio_interface(parser: ResultParser):
            outputs=[length_heatmap, contextual_plot, primary_task_radar_plot, language_plot, difficulty_plot, bon_plot, pass_k_plot]
        )
 
-        # 初始化 - 页面加载时自动刷新数据
+        # Initialize
        demo.load(
            fn=refresh_data,
            outputs=[leaderboard_df]
@@ -861,19 +866,19 @@ def create_gradio_interface(parser: ResultParser):
    return demo
 
 def main():
-    """主函数"""
+    """Main function"""
    output_dir = "./results"
 
-    print("初始化结果解析器...")
+    print("Initializing result parser...")
    parser = ResultParser(output_dir)
 
-    print("扫描结果文件...")
+    print("Scanning result files...")
    parser.scan_all_results()
 
-    print("创建Gradio界面...")
+    print("Creating Gradio interface...")
    demo = create_gradio_interface(parser)
 
-    print("启动服务器...")
+    print("Starting server...")
    demo.launch()
 
 if __name__ == "__main__":
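For context, the interface assembled in `create_gradio_interface` follows the usual Blocks pattern: a button click maps the selected models to a tuple of figures, and `demo.load` refreshes the leaderboard when the page opens. A stripped-down sketch of that wiring, with placeholder component names and a single plot instead of the app's seven:

```python
import gradio as gr
import plotly.graph_objects as go

def make_chart(selected):
    # Placeholder figure; the real update_charts returns seven figures at once.
    selected = selected or []
    return go.Figure(go.Bar(x=selected, y=list(range(1, len(selected) + 1))))

with gr.Blocks() as demo:
    selector = gr.Dropdown(choices=["model-a", "model-b"], multiselect=True,
                           label="Select Models")
    button = gr.Button("Update Charts", variant="primary")
    plot = gr.Plot(show_label=False)
    button.click(fn=make_chart, inputs=[selector], outputs=[plot])

demo.launch()
```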
 