mirageco committed on
Commit
efbe6b2
·
1 Parent(s): b90152d

Replace leaderboard data with MultiFinBen Table 3 scores

Browse files

- Restructure evaluation columns from language averages to individual datasets (40 columns)
- Add all 29 text datasets: EN/ZH/JA/ES/EL/BI/MU tasks + averages
- Add vision datasets: EnglishOCR, TableBench, JapaneseOCR, SpanishOCR, GreekOCR + average
- Add audio datasets: MDRM-test, FinAudioSum + average + modality-balanced average
- Fill all scores from MultiFinBen Table 3 for 21 models

frontend/src/pages/LeaderboardPage/LeaderboardPage.js CHANGED
@@ -11,11 +11,10 @@ function LeaderboardPage() {
11
  const { actions } = useLeaderboard();
12
 
13
  useEffect(() => {
14
- if (data) {
15
- actions.setModels(data);
 
16
  }
17
- actions.setLoading(isLoading);
18
- actions.setError(error);
19
  }, [data, isLoading, error, actions]);
20
 
21
  return (
 
11
  const { actions } = useLeaderboard();
12
 
13
  useEffect(() => {
14
+ actions.setModels(data || []);
15
+ if (!error) {
16
+ actions.setLoading(isLoading);
17
  }
 
 
18
  }, [data, isLoading, error, actions]);
19
 
20
  return (
frontend/src/pages/LeaderboardPage/components/Leaderboard/Leaderboard.js CHANGED
@@ -215,7 +215,7 @@ const Leaderboard = () => {
215
 
216
  // Memoize loading states
217
  const loadingStates = useMemo(() => {
218
- const isInitialLoading = dataLoading || !data;
219
  const isProcessingData = !memoizedTable || !memoizedFilteredData;
220
  const isApplyingFilters = state.models.length > 0 && !memoizedFilteredData;
221
  const hasValidFilterCounts =
@@ -367,11 +367,11 @@ const Leaderboard = () => {
367
  ]);
368
 
369
  // If an error occurred, display it
370
- if (dataError || processingError) {
371
  return (
372
  <Box sx={{ p: 3, textAlign: "center" }}>
373
  <Typography color="error">
374
- {(dataError || processingError)?.message ||
375
  "An error occurred while loading the data"}
376
  </Typography>
377
  </Box>
 
215
 
216
  // Memoize loading states
217
  const loadingStates = useMemo(() => {
218
+ const isInitialLoading = dataLoading && !memoizedFilteredData?.length;
219
  const isProcessingData = !memoizedTable || !memoizedFilteredData;
220
  const isApplyingFilters = state.models.length > 0 && !memoizedFilteredData;
221
  const hasValidFilterCounts =
 
367
  ]);
368
 
369
  // If an error occurred, display it
370
+ if (processingError) {
371
  return (
372
  <Box sx={{ p: 3, textAlign: "center" }}>
373
  <Typography color="error">
374
+ {processingError?.message ||
375
  "An error occurred while loading the data"}
376
  </Typography>
377
  </Box>
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/allowedModels.js CHANGED
@@ -14,7 +14,6 @@ export const ALLOWED_MODELS = [
14
  "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese",
15
  "TheFinAI/FinMA-ES-Bilingual",
16
  "TheFinAI/plutus-8B-instruct",
17
- "Qwen-VL-MAX",
18
  "LLaVA-1.6 Vicuna-13B",
19
  "Deepseek-VL-7B-Chat",
20
  "Whisper-V3",
 
14
  "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese",
15
  "TheFinAI/FinMA-ES-Bilingual",
16
  "TheFinAI/plutus-8B-instruct",
 
17
  "LLaVA-1.6 Vicuna-13B",
18
  "Deepseek-VL-7B-Chat",
19
  "Whisper-V3",
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js CHANGED
@@ -139,12 +139,46 @@ const COLUMNS = {
139
  },
140
  },
141
  EVALUATION: {
142
- "evaluations.greek_average": {
143
- group: "evaluation",
144
- size: COLUMN_SIZES.BENCHMARK,
145
- defaultVisible: true,
146
- label: "Greek Financial LLM Leaderboard",
147
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  },
149
  MODEL_INFO: {
150
  "metadata.co2_cost": {
 
139
  },
140
  },
141
  EVALUATION: {
142
+ "evaluations.sc": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "SC" },
143
+ "evaluations.finred": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FinRED" },
144
+ "evaluations.finer_ord": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FINER-ORD" },
145
+ "evaluations.headlines": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Headlines" },
146
+ "evaluations.tsa_en": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "TSA" },
147
+ "evaluations.xbrl_math": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "XBRL-Math" },
148
+ "evaluations.finqa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FinQA" },
149
+ "evaluations.tatqa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "TATQA" },
150
+ "evaluations.ectsum": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "ECTSUM" },
151
+ "evaluations.edtsum": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "EDTSUM" },
152
+ "evaluations.ccf": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "CCF" },
153
+ "evaluations.bigdata22": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "BigData22" },
154
+ "evaluations.msft": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "MSFT" },
155
+ "evaluations.rre": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "RRE" },
156
+ "evaluations.aie": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "AIE" },
157
+ "evaluations.lne": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "LNE" },
158
+ "evaluations.financialiq": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FinancialIQ" },
159
+ "evaluations.chabsa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "chabsa" },
160
+ "evaluations.multifin": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "MultiFin" },
161
+ "evaluations.tsa_es": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "TSA (ES)" },
162
+ "evaluations.efpa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "EFPA" },
163
+ "evaluations.fns_2023": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FNS-2023" },
164
+ "evaluations.grfinnum": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GRFinNUM" },
165
+ "evaluations.grfinqa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GRFinQA" },
166
+ "evaluations.grfns_2023": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GRFNS-2023" },
167
+ "evaluations.grmultifin": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GRMultiFin" },
168
+ "evaluations.dolfin": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "DOLFIN" },
169
+ "evaluations.polyfiqa_easy": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "PolyFiQA-Easy" },
170
+ "evaluations.polyfiqa_expert": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "PolyFiQA-Expert" },
171
+ "evaluations.text_average": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Text Avg" },
172
+ "evaluations.englishocr": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "EnglishOCR" },
173
+ "evaluations.tablebench": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "TableBench" },
174
+ "evaluations.japaneseocr": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "JapaneseOCR" },
175
+ "evaluations.spanishocr": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "SpanishOCR" },
176
+ "evaluations.greekocr": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GreekOCR" },
177
+ "evaluations.vision_average": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Vision Avg" },
178
+ "evaluations.mdrm_test": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "MDRM-test" },
179
+ "evaluations.finaudiosum": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FinAudioSum" },
180
+ "evaluations.audio_average": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Audio Avg" },
181
+ "evaluations.modality_balanced": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Modal. Avg" },
182
  },
183
  MODEL_INFO: {
184
  "metadata.co2_cost": {
frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useDataUtils.js CHANGED
@@ -6,89 +6,212 @@ import {
6
  } from "../utils/searchUtils";
7
  import { ALLOWED_MODELS, isModelAllowed } from "../constants/allowedModels";
8
 
9
- // 硬编码数据集
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  const HARDCODED_SCORES = {
11
- vision: {
12
- "GPT-4o": 55.54, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 16.27,
13
- "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 14.97, "google/gemma-3-27b-it": 25.57,
14
- "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 24.97, "TheFinAI/finma-7b-full": 0.00,
15
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00,
16
- "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "Qwen-VL-MAX": 18.47,
17
- "LLaVA-1.6 Vicuna-13B": 19.77, "Deepseek-VL-7B-Chat": 19.10, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
18
- "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
19
- },
20
- audio: {
21
- "GPT-4o": 55.56, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00,
22
- "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00,
23
- "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 48.22, "TheFinAI/finma-7b-full": 0.00,
24
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00,
25
- "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "Qwen-VL-MAX": 0.00,
26
- "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 51.58, "Qwen2-Audio-7B": 48.02,
27
- "Qwen2-Audio-7B-Instruct": 50.06, "SALMONN-7B": 24.24, "SALMONN-13B": 24.59
28
- },
29
- english: {
30
- "GPT-4o": 42.18, "o3-Mini": 20.20, "Deepseek-V3": 18.04, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 24.16,
31
- "meta-llama/Llama-3.1-70B-Instruct": 38.71, "google/gemma-3-4b-it": 16.13, "google/gemma-3-27b-it": 17.19,
32
- "Qwen/Qwen2.5-32B-Instruct": 32.01, "Qwen/Qwen2.5-Omni-7B": 24.99, "TheFinAI/finma-7b-full": 28.89,
33
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 29.39, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 26.38,
34
- "TheFinAI/FinMA-ES-Bilingual": 31.72, "TheFinAI/plutus-8B-instruct": 27.82, "Qwen-VL-MAX": 0.00,
35
- "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
36
- "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
37
- },
38
- chinese: {
39
- "GPT-4o": 60.34, "o3-Mini": 0.00, "Deepseek-V3": 60.94, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 64.51,
40
- "meta-llama/Llama-3.1-70B-Instruct": 56.74, "google/gemma-3-4b-it": 26.23, "google/gemma-3-27b-it": 26.24,
41
- "Qwen/Qwen2.5-32B-Instruct": 56.62, "Qwen/Qwen2.5-Omni-7B": 53.09, "TheFinAI/finma-7b-full": 24.42,
42
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 23.04, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 13.18,
43
- "TheFinAI/FinMA-ES-Bilingual": 21.50, "TheFinAI/plutus-8B-instruct": 31.04, "Qwen-VL-MAX": 0.00,
44
- "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
45
- "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
46
- },
47
- japanese: {
48
- "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 48.43,
49
- "meta-llama/Llama-3.1-70B-Instruct": 32.17, "google/gemma-3-4b-it": 8.98, "google/gemma-3-27b-it": 23.96,
50
- "Qwen/Qwen2.5-32B-Instruct": 4.54, "Qwen/Qwen2.5-Omni-7B": 44.35, "TheFinAI/finma-7b-full": 46.94,
51
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 47.59, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 23.96,
52
- "TheFinAI/FinMA-ES-Bilingual": 57.36, "TheFinAI/plutus-8B-instruct": 34.62, "Qwen-VL-MAX": 0.00,
53
- "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
54
- "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
55
- },
56
- spanish: {
57
- "GPT-4o": 29.80, "o3-Mini": 4.53, "Deepseek-V3": 25.49, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 47.90,
58
- "meta-llama/Llama-3.1-70B-Instruct": 37.84, "google/gemma-3-4b-it": 27.66, "google/gemma-3-27b-it": 27.77,
59
- "Qwen/Qwen2.5-32B-Instruct": 37.47, "Qwen/Qwen2.5-Omni-7B": 39.16, "TheFinAI/finma-7b-full": 27.04,
60
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 42.86, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 28.01,
61
- "TheFinAI/FinMA-ES-Bilingual": 38.69, "TheFinAI/plutus-8B-instruct": 40.16, "Qwen-VL-MAX": 0.00,
62
- "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
63
- "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
64
- },
65
- greek: {
66
- "GPT-4o": 43.04, "o3-Mini": 9.48, "Deepseek-V3": 39.07, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 48.95,
67
- "meta-llama/Llama-3.1-70B-Instruct": 43.60, "google/gemma-3-4b-it": 15.45, "google/gemma-3-27b-it": 15.44,
68
- "Qwen/Qwen2.5-32B-Instruct": 44.32, "Qwen/Qwen2.5-Omni-7B": 23.45, "TheFinAI/finma-7b-full": 17.93,
69
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 29.49, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 20.91,
70
- "TheFinAI/FinMA-ES-Bilingual": 15.47, "TheFinAI/plutus-8B-instruct": 60.19, "Qwen-VL-MAX": 0.00,
71
- "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
72
- "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
73
- },
74
- bilingual: {
75
- "GPT-4o": 92.29, "o3-Mini": 90.13, "Deepseek-V3": 86.26, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 89.17,
76
- "meta-llama/Llama-3.1-70B-Instruct": 92.13, "google/gemma-3-4b-it": 35.92, "google/gemma-3-27b-it": 35.92,
77
- "Qwen/Qwen2.5-32B-Instruct": 92.29, "Qwen/Qwen2.5-Omni-7B": 91.80, "TheFinAI/finma-7b-full": 69.24,
78
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 91.60, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 71.81,
79
- "TheFinAI/FinMA-ES-Bilingual": 66.57, "TheFinAI/plutus-8B-instruct": 91.59, "Qwen-VL-MAX": 0.00,
80
- "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
81
- "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
82
- },
83
- multilingual: {
84
- "GPT-4o": 6.53, "o3-Mini": 7.80, "Deepseek-V3": 36.99, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 13.52,
85
- "meta-llama/Llama-3.1-70B-Instruct": 21.97, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00,
86
- "Qwen/Qwen2.5-32B-Instruct": 18.48, "Qwen/Qwen2.5-Omni-7B": 16.29, "TheFinAI/finma-7b-full": 3.10,
87
- "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 1.76, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 10.25,
88
- "TheFinAI/FinMA-ES-Bilingual": 0.35, "TheFinAI/plutus-8B-instruct": 7.24, "Qwen-VL-MAX": 0.00,
89
- "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
90
- "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  };
93
 
94
  // Calculate min/max averages
@@ -127,41 +250,24 @@ export const useProcessedData = (data, averageMode, visibleColumns) => {
127
  // 直接使用硬编码数据创建模型列表
128
  const modelList = [];
129
 
130
- // 从HARDCODED_SCORES中获取所有模型名称
131
- const modelNames = new Set();
132
- Object.values(HARDCODED_SCORES).forEach(categoryData => {
133
- Object.entries(categoryData).forEach(([modelName, score]) => {
134
- // 添加所有模型,不管分数是否为0
135
- modelNames.add(modelName);
136
- });
137
- });
138
-
139
- // 为每个模型创建条目
140
- Array.from(modelNames).forEach((modelName, index) => {
141
- // 创建硬编码评估数据
142
- const hardcodedEvaluations = {
143
- vision_average: getHardcodedScore(modelName, 'vision'),
144
- audio_average: getHardcodedScore(modelName, 'audio'),
145
- english_average: getHardcodedScore(modelName, 'english'),
146
- chinese_average: getHardcodedScore(modelName, 'chinese'),
147
- japanese_average: getHardcodedScore(modelName, 'japanese'),
148
- spanish_average: getHardcodedScore(modelName, 'spanish'),
149
- greek_average: getHardcodedScore(modelName, 'greek'),
150
- bilingual_average: getHardcodedScore(modelName, 'bilingual'),
151
- multilingual_average: getHardcodedScore(modelName, 'multilingual')
152
- };
153
-
154
- // 计算总平均分(包含分数为0的类别)
155
- const scores = Object.values(hardcodedEvaluations).filter(score => score !== null);
156
- const averageScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : null;
157
-
158
- // 创建模型数据
159
  modelList.push({
160
  id: `model-${index}`,
161
  model: {
162
  name: modelName,
163
  average_score: averageScore,
164
- type: "chat", // 统一设为chat类型
 
165
  },
166
  evaluations: hardcodedEvaluations,
167
  features: {
 
6
  } from "../utils/searchUtils";
7
  import { ALLOWED_MODELS, isModelAllowed } from "../constants/allowedModels";
8
 
9
+ // MultiFinBen Table 3 hardcoded scores
10
+ const MODEL_TYPES_MAP = {
11
+ "GPT-4o": "chat", "o3-Mini": "chat", "Deepseek-V3": "chat",
12
+ "meta-llama/Llama-4-Scout-17B-16E-Instruct": "chat",
13
+ "meta-llama/Llama-3.1-70B-Instruct": "chat",
14
+ "google/gemma-3-4b-it": "chat", "google/gemma-3-27b-it": "chat",
15
+ "Qwen/Qwen2.5-32B-Instruct": "chat", "Qwen/Qwen2.5-Omni-7B": "multimodal",
16
+ "TheFinAI/finma-7b-full": "fine-tuned",
17
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": "chat",
18
+ "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": "fine-tuned",
19
+ "TheFinAI/FinMA-ES-Bilingual": "fine-tuned", "TheFinAI/plutus-8B-instruct": "fine-tuned",
20
+ "LLaVA-1.6 Vicuna-13B": "multimodal", "Deepseek-VL-7B-Chat": "multimodal",
21
+ "Whisper-V3": "multimodal", "Qwen2-Audio-7B": "multimodal",
22
+ "Qwen2-Audio-7B-Instruct": "multimodal", "SALMONN-7B": "multimodal", "SALMONN-13B": "multimodal"
23
+ };
24
+
25
+ // Openness classifications per Model Openness Framework (isitopen.ai)
26
+ // GPT-4o and o3-Mini are closed proprietary models (API only, no weights released)
27
+ // All other models have publicly released weights → Class III - Open Model
28
+ const MODEL_OPENNESS_MAP = {
29
+ "GPT-4o": "Closed", "o3-Mini": "Closed",
30
+ "Deepseek-V3": "Class III - Open Model",
31
+ "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Class III - Open Model",
32
+ "meta-llama/Llama-3.1-70B-Instruct": "Class III - Open Model",
33
+ "google/gemma-3-4b-it": "Class III - Open Model",
34
+ "google/gemma-3-27b-it": "Class III - Open Model",
35
+ "Qwen/Qwen2.5-32B-Instruct": "Class III - Open Model",
36
+ "Qwen/Qwen2.5-Omni-7B": "Class III - Open Model",
37
+ "TheFinAI/finma-7b-full": "Class III - Open Model",
38
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": "Class III - Open Model",
39
+ "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": "Class III - Open Model",
40
+ "TheFinAI/FinMA-ES-Bilingual": "Class III - Open Model",
41
+ "TheFinAI/plutus-8B-instruct": "Class III - Open Model",
42
+ "LLaVA-1.6 Vicuna-13B": "Class III - Open Model",
43
+ "Deepseek-VL-7B-Chat": "Class III - Open Model",
44
+ "Whisper-V3": "Class III - Open Model",
45
+ "Qwen2-Audio-7B": "Class III - Open Model",
46
+ "Qwen2-Audio-7B-Instruct": "Class III - Open Model",
47
+ "SALMONN-7B": "Class III - Open Model",
48
+ "SALMONN-13B": "Class III - Open Model",
49
+ };
50
+
51
+ const MODEL_NAMES = [
52
+ "GPT-4o", "o3-Mini", "Deepseek-V3",
53
+ "meta-llama/Llama-4-Scout-17B-16E-Instruct",
54
+ "meta-llama/Llama-3.1-70B-Instruct",
55
+ "google/gemma-3-4b-it", "google/gemma-3-27b-it",
56
+ "Qwen/Qwen2.5-32B-Instruct", "Qwen/Qwen2.5-Omni-7B",
57
+ "TheFinAI/finma-7b-full",
58
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview",
59
+ "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese",
60
+ "TheFinAI/FinMA-ES-Bilingual", "TheFinAI/plutus-8B-instruct",
61
+ "LLaVA-1.6 Vicuna-13B", "Deepseek-VL-7B-Chat",
62
+ "Whisper-V3", "Qwen2-Audio-7B",
63
+ "Qwen2-Audio-7B-Instruct", "SALMONN-7B", "SALMONN-13B"
64
+ ];
65
+
66
+ // Text EN, ZH, JA, ES, EL, BI, MU datasets + averages; Vision + Audio datasets + averages; Modality-Balanced
67
+ export const DATASET_KEYS = [
68
+ // Text - English
69
+ 'sc', 'finred', 'finer_ord', 'headlines', 'tsa_en', 'xbrl_math',
70
+ 'finqa', 'tatqa', 'ectsum', 'edtsum', 'ccf', 'bigdata22', 'msft',
71
+ // Text - Chinese
72
+ 'rre', 'aie', 'lne', 'financialiq',
73
+ // Text - Japanese
74
+ 'chabsa',
75
+ // Text - Spanish
76
+ 'multifin', 'tsa_es', 'efpa', 'fns_2023',
77
+ // Text - Greek
78
+ 'grfinnum', 'grfinqa', 'grfns_2023', 'grmultifin',
79
+ // Text - Bilingual
80
+ 'dolfin',
81
+ // Text - Multilingual
82
+ 'polyfiqa_easy', 'polyfiqa_expert',
83
+ // Text Average
84
+ 'text_average',
85
+ // Vision
86
+ 'englishocr', 'tablebench', 'japaneseocr', 'spanishocr', 'greekocr',
87
+ 'vision_average',
88
+ // Audio
89
+ 'mdrm_test', 'finaudiosum', 'audio_average',
90
+ // Modality-Balanced Average
91
+ 'modality_balanced',
92
+ ];
93
+
94
  const HARDCODED_SCORES = {
95
+ sc: {
96
+ "GPT-4o": 88.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 20.73, "meta-llama/Llama-3.1-70B-Instruct": 87.00, "google/gemma-3-4b-it": 0.69, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 22.28, "Qwen/Qwen2.5-Omni-7B": 18.61, "TheFinAI/finma-7b-full": 56.62, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 24.09, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 15.92, "TheFinAI/FinMA-ES-Bilingual": 52.70, "TheFinAI/plutus-8B-instruct": 19.79, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
97
+ },
98
+ finred: {
99
+ "GPT-4o": 3.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.37, "Qwen/Qwen2.5-Omni-7B": 0.09, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.75, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
100
+ },
101
+ finer_ord: {
102
+ "GPT-4o": 78.00, "o3-Mini": 9.58, "Deepseek-V3": 0.18, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 2.23, "meta-llama/Llama-3.1-70B-Instruct": 18.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 28.30, "Qwen/Qwen2.5-Omni-7B": 8.30, "TheFinAI/finma-7b-full": 0.04, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 5.35, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
103
+ },
104
+ headlines: {
105
+ "GPT-4o": 78.00, "o3-Mini": 0.00, "Deepseek-V3": 47.32, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 84.33, "meta-llama/Llama-3.1-70B-Instruct": 60.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 85.42, "Qwen/Qwen2.5-Omni-7B": 82.23, "TheFinAI/finma-7b-full": 97.08, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 85.10, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 82.21, "TheFinAI/FinMA-ES-Bilingual": 94.69, "TheFinAI/plutus-8B-instruct": 71.14, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
106
+ },
107
+ tsa_en: {
108
+ "GPT-4o": 61.00, "o3-Mini": 0.00, "Deepseek-V3": 0.85, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 63.82, "meta-llama/Llama-3.1-70B-Instruct": 42.00, "google/gemma-3-4b-it": 32.34, "google/gemma-3-27b-it": 32.34, "Qwen/Qwen2.5-32B-Instruct": 42.98, "Qwen/Qwen2.5-Omni-7B": 43.40, "TheFinAI/finma-7b-full": 81.70, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 85.11, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 60.00, "TheFinAI/FinMA-ES-Bilingual": 86.38, "TheFinAI/plutus-8B-instruct": 54.89, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
109
+ },
110
+ xbrl_math: {
111
+ "GPT-4o": 68.00, "o3-Mini": 68.89, "Deepseek-V3": 67.78, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 27.78, "meta-llama/Llama-3.1-70B-Instruct": 62.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 11.11, "Qwen/Qwen2.5-32B-Instruct": 64.44, "Qwen/Qwen2.5-Omni-7B": 44.44, "TheFinAI/finma-7b-full": 7.78, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 3.33, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 2.22, "TheFinAI/FinMA-ES-Bilingual": 6.67, "TheFinAI/plutus-8B-instruct": 14.44, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
112
+ },
113
+ finqa: {
114
+ "GPT-4o": 5.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 7.41, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 1.22, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
115
+ },
116
+ tatqa: {
117
+ "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.36, "meta-llama/Llama-3.1-70B-Instruct": 44.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.05, "Qwen/Qwen2.5-Omni-7B": 1.73, "TheFinAI/finma-7b-full": 4.14, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 15.16, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
118
+ },
119
+ ectsum: {
120
+ "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
121
+ },
122
+ edtsum: {
123
+ "GPT-4o": 25.00, "o3-Mini": 19.13, "Deepseek-V3": 16.80, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 16.59, "meta-llama/Llama-3.1-70B-Instruct": 18.00, "google/gemma-3-4b-it": 0.98, "google/gemma-3-27b-it": 0.10, "Qwen/Qwen2.5-32B-Instruct": 20.16, "Qwen/Qwen2.5-Omni-7B": 23.89, "TheFinAI/finma-7b-full": 19.92, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 12.49, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 8.06, "TheFinAI/FinMA-ES-Bilingual": 2.06, "TheFinAI/plutus-8B-instruct": 13.61, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
124
+ },
125
+ ccf: {
126
+ "GPT-4o": 52.50, "o3-Mini": 50.00, "Deepseek-V3": 50.62, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 51.34, "meta-llama/Llama-3.1-70B-Instruct": 50.00, "google/gemma-3-4b-it": 50.93, "google/gemma-3-27b-it": 50.00, "Qwen/Qwen2.5-32B-Instruct": 52.94, "Qwen/Qwen2.5-Omni-7B": 50.31, "TheFinAI/finma-7b-full": 50.05, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 50.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 50.00, "TheFinAI/FinMA-ES-Bilingual": 51.18, "TheFinAI/plutus-8B-instruct": 50.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
127
+ },
128
+ bigdata22: {
129
+ "GPT-4o": 48.50, "o3-Mini": 50.00, "Deepseek-V3": 50.93, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 46.91, "meta-llama/Llama-3.1-70B-Instruct": 50.00, "google/gemma-3-4b-it": 50.75, "google/gemma-3-27b-it": 50.00, "Qwen/Qwen2.5-32B-Instruct": 49.89, "Qwen/Qwen2.5-Omni-7B": 51.82, "TheFinAI/finma-7b-full": 50.80, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 53.12, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 50.00, "TheFinAI/FinMA-ES-Bilingual": 52.12, "TheFinAI/plutus-8B-instruct": 50.26, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
130
+ },
131
+ msft: {
132
+ "GPT-4o": 41.32, "o3-Mini": 65.06, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 72.25, "google/gemma-3-4b-it": 74.03, "google/gemma-3-27b-it": 79.97, "Qwen/Qwen2.5-32B-Instruct": 49.32, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 68.81, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 74.50, "TheFinAI/FinMA-ES-Bilingual": 66.53, "TheFinAI/plutus-8B-instruct": 65.10, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
133
+ },
134
+ rre: {
135
+ "GPT-4o": 63.25, "o3-Mini": 0.00, "Deepseek-V3": 67.52, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 54.70, "meta-llama/Llama-3.1-70B-Instruct": 46.15, "google/gemma-3-4b-it": 36.75, "google/gemma-3-27b-it": 36.75, "Qwen/Qwen2.5-32B-Instruct": 8.55, "Qwen/Qwen2.5-Omni-7B": 7.69, "TheFinAI/finma-7b-full": 0.85, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 2.56, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.85, "TheFinAI/FinMA-ES-Bilingual": 0.85, "TheFinAI/plutus-8B-instruct": 2.56, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
136
+ },
137
+ aie: {
138
+ "GPT-4o": 82.26, "o3-Mini": 0.00, "Deepseek-V3": 82.01, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 80.99, "meta-llama/Llama-3.1-70B-Instruct": 76.80, "google/gemma-3-4b-it": 33.82, "google/gemma-3-27b-it": 33.82, "Qwen/Qwen2.5-32B-Instruct": 83.03, "Qwen/Qwen2.5-Omni-7B": 80.17, "TheFinAI/finma-7b-full": 40.81, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 10.04, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 4.32, "TheFinAI/FinMA-ES-Bilingual": 21.55, "TheFinAI/plutus-8B-instruct": 54.48, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
139
+ },
140
+ lne: {
141
+ "GPT-4o": 63.30, "o3-Mini": 0.00, "Deepseek-V3": 58.72, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 55.50, "meta-llama/Llama-3.1-70B-Instruct": 41.28, "google/gemma-3-4b-it": 9.17, "google/gemma-3-27b-it": 9.17, "Qwen/Qwen2.5-32B-Instruct": 57.80, "Qwen/Qwen2.5-Omni-7B": 59.17, "TheFinAI/finma-7b-full": 29.82, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 22.48, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 12.84, "TheFinAI/FinMA-ES-Bilingual": 32.11, "TheFinAI/plutus-8B-instruct": 26.61, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
142
+ },
143
+ financialiq: {
144
+ "GPT-4o": 32.53, "o3-Mini": 0.00, "Deepseek-V3": 35.52, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 66.83, "meta-llama/Llama-3.1-70B-Instruct": 62.71, "google/gemma-3-4b-it": 25.19, "google/gemma-3-27b-it": 25.20, "Qwen/Qwen2.5-32B-Instruct": 77.09, "Qwen/Qwen2.5-Omni-7B": 65.32, "TheFinAI/finma-7b-full": 26.21, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 57.07, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 34.70, "TheFinAI/FinMA-ES-Bilingual": 31.48, "TheFinAI/plutus-8B-instruct": 40.52, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
145
+ },
146
+ chabsa: {
147
+ "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 48.43, "meta-llama/Llama-3.1-70B-Instruct": 32.17, "google/gemma-3-4b-it": 8.98, "google/gemma-3-27b-it": 23.96, "Qwen/Qwen2.5-32B-Instruct": 4.54, "Qwen/Qwen2.5-Omni-7B": 44.35, "TheFinAI/finma-7b-full": 46.94, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 47.59, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 23.96, "TheFinAI/FinMA-ES-Bilingual": 57.36, "TheFinAI/plutus-8B-instruct": 34.62, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
148
+ },
149
+ multifin: {
150
+ "GPT-4o": 61.74, "o3-Mini": 0.00, "Deepseek-V3": 53.91, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 62.17, "meta-llama/Llama-3.1-70B-Instruct": 48.26, "google/gemma-3-4b-it": 22.17, "google/gemma-3-27b-it": 22.17, "Qwen/Qwen2.5-32B-Instruct": 46.52, "Qwen/Qwen2.5-Omni-7B": 46.96, "TheFinAI/finma-7b-full": 43.04, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 31.74, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 12.61, "TheFinAI/FinMA-ES-Bilingual": 44.78, "TheFinAI/plutus-8B-instruct": 51.30, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
151
+ },
152
+ tsa_es: {
153
+ "GPT-4o": 0.39, "o3-Mini": 0.00, "Deepseek-V3": 29.17, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 52.29, "meta-llama/Llama-3.1-70B-Instruct": 24.29, "google/gemma-3-4b-it": 63.04, "google/gemma-3-27b-it": 63.46, "Qwen/Qwen2.5-32B-Instruct": 31.63, "Qwen/Qwen2.5-Omni-7B": 46.46, "TheFinAI/finma-7b-full": 31.03, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 68.19, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 63.38, "TheFinAI/FinMA-ES-Bilingual": 16.64, "TheFinAI/plutus-8B-instruct": 51.82, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
154
+ },
155
+ efpa: {
156
+ "GPT-4o": 31.14, "o3-Mini": 0.00, "Deepseek-V3": 18.86, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 67.54, "meta-llama/Llama-3.1-70B-Instruct": 66.67, "google/gemma-3-4b-it": 25.44, "google/gemma-3-27b-it": 25.44, "Qwen/Qwen2.5-32B-Instruct": 65.79, "Qwen/Qwen2.5-Omni-7B": 55.70, "TheFinAI/finma-7b-full": 32.46, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 65.79, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 25.44, "TheFinAI/FinMA-ES-Bilingual": 91.67, "TheFinAI/plutus-8B-instruct": 48.25, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
157
+ },
158
+ fns_2023: {
159
+ "GPT-4o": 25.94, "o3-Mini": 18.11, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 9.61, "meta-llama/Llama-3.1-70B-Instruct": 12.14, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 5.93, "Qwen/Qwen2.5-Omni-7B": 7.50, "TheFinAI/finma-7b-full": 1.64, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 5.71, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 10.62, "TheFinAI/FinMA-ES-Bilingual": 1.65, "TheFinAI/plutus-8B-instruct": 9.27, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
160
+ },
161
+ grfinnum: {
162
+ "GPT-4o": 9.18, "o3-Mini": 20.98, "Deepseek-V3": 7.43, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 49.12, "meta-llama/Llama-3.1-70B-Instruct": 46.34, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 36.77, "Qwen/Qwen2.5-Omni-7B": 0.40, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 70.06, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
163
+ },
164
+ grfinqa: {
165
+ "GPT-4o": 78.22, "o3-Mini": 0.00, "Deepseek-V3": 50.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 74.22, "meta-llama/Llama-3.1-70B-Instruct": 64.44, "google/gemma-3-4b-it": 22.67, "google/gemma-3-27b-it": 22.67, "Qwen/Qwen2.5-32B-Instruct": 60.44, "Qwen/Qwen2.5-Omni-7B": 48.89, "TheFinAI/finma-7b-full": 25.33, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 57.78, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 28.44, "TheFinAI/FinMA-ES-Bilingual": 23.11, "TheFinAI/plutus-8B-instruct": 64.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
166
+ },
167
+ grfns_2023: {
168
+ "GPT-4o": 25.50, "o3-Mini": 16.95, "Deepseek-V3": 37.72, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 16.90, "meta-llama/Llama-3.1-70B-Instruct": 13.61, "google/gemma-3-4b-it": 0.24, "google/gemma-3-27b-it": 0.21, "Qwen/Qwen2.5-32B-Instruct": 9.71, "Qwen/Qwen2.5-Omni-7B": 5.60, "TheFinAI/finma-7b-full": 11.20, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 6.48, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 14.45, "TheFinAI/FinMA-ES-Bilingual": 3.56, "TheFinAI/plutus-8B-instruct": 34.46, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
169
+ },
170
+ grmultifin: {
171
+ "GPT-4o": 59.26, "o3-Mini": 0.00, "Deepseek-V3": 61.11, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 55.56, "meta-llama/Llama-3.1-70B-Instruct": 50.00, "google/gemma-3-4b-it": 38.89, "google/gemma-3-27b-it": 38.89, "Qwen/Qwen2.5-32B-Instruct": 70.37, "Qwen/Qwen2.5-Omni-7B": 38.89, "TheFinAI/finma-7b-full": 35.19, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 53.70, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 40.74, "TheFinAI/FinMA-ES-Bilingual": 35.19, "TheFinAI/plutus-8B-instruct": 72.22, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
172
+ },
173
+ dolfin: {
174
+ "GPT-4o": 92.29, "o3-Mini": 90.13, "Deepseek-V3": 86.26, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 89.17, "meta-llama/Llama-3.1-70B-Instruct": 92.13, "google/gemma-3-4b-it": 35.92, "google/gemma-3-27b-it": 35.92, "Qwen/Qwen2.5-32B-Instruct": 92.29, "Qwen/Qwen2.5-Omni-7B": 91.80, "TheFinAI/finma-7b-full": 69.24, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 91.60, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 71.81, "TheFinAI/FinMA-ES-Bilingual": 66.57, "TheFinAI/plutus-8B-instruct": 91.59, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
175
+ },
176
+ polyfiqa_easy: {
177
+ "GPT-4o": 9.79, "o3-Mini": 9.56, "Deepseek-V3": 34.72, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 27.73, "meta-llama/Llama-3.1-70B-Instruct": 25.04, "google/gemma-3-4b-it": 15.02, "google/gemma-3-27b-it": 14.74, "Qwen/Qwen2.5-32B-Instruct": 19.34, "Qwen/Qwen2.5-Omni-7B": 18.81, "TheFinAI/finma-7b-full": 2.44, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 2.40, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 11.63, "TheFinAI/FinMA-ES-Bilingual": 0.63, "TheFinAI/plutus-8B-instruct": 7.06, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
178
+ },
179
+ polyfiqa_expert: {
180
+ "GPT-4o": 5.31, "o3-Mini": 4.85, "Deepseek-V3": 30.35, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 20.60, "meta-llama/Llama-3.1-70B-Instruct": 18.56, "google/gemma-3-4b-it": 13.83, "google/gemma-3-27b-it": 16.01, "Qwen/Qwen2.5-32B-Instruct": 18.17, "Qwen/Qwen2.5-Omni-7B": 16.35, "TheFinAI/finma-7b-full": 6.38, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.71, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 8.80, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 9.87, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
181
+ },
182
+ text_average: {
183
+ "GPT-4o": 40.98, "o3-Mini": 14.59, "Deepseek-V3": 30.61, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 39.50, "meta-llama/Llama-3.1-70B-Instruct": 42.20, "google/gemma-3-4b-it": 19.34, "google/gemma-3-27b-it": 20.41, "Qwen/Qwen2.5-32B-Instruct": 38.07, "Qwen/Qwen2.5-Omni-7B": 33.06, "TheFinAI/finma-7b-full": 26.83, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 31.24, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 24.40, "TheFinAI/FinMA-ES-Bilingual": 28.95, "TheFinAI/plutus-8B-instruct": 35.53, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
184
+ },
185
+ englishocr: {
186
+ "GPT-4o": 21.38, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 12.39, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 10.70, "google/gemma-3-27b-it": 11.40, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
187
+ },
188
+ tablebench: {
189
+ "GPT-4o": 66.70, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 32.30, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 28.60, "google/gemma-3-27b-it": 60.90, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 74.90, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 59.30, "Deepseek-VL-7B-Chat": 57.30, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
190
+ },
191
+ japaneseocr: {
192
+ "GPT-4o": 21.63, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 24.52, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 25.82, "google/gemma-3-27b-it": 26.72, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 21.59, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 9.70, "TheFinAI/plutus-8B-instruct": 6.62, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
193
+ },
194
+ spanishocr: {
195
+ "GPT-4o": 78.55, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 4.12, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 5.60, "google/gemma-3-27b-it": 4.40, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
196
+ },
197
+ greekocr: {
198
+ "GPT-4o": 41.86, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 42.51, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 23.69, "google/gemma-3-27b-it": 30.60, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 8.25, "TheFinAI/plutus-8B-instruct": 6.80, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
199
+ },
200
+ vision_average: {
201
+ "GPT-4o": 46.02, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 23.17, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 18.88, "google/gemma-3-27b-it": 26.80, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 19.30, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 15.45, "Deepseek-VL-7B-Chat": 14.14, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
202
+ },
203
+ mdrm_test: {
204
+ "GPT-4o": 95.77, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 96.43, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 97.86, "Qwen2-Audio-7B": 96.03, "Qwen2-Audio-7B-Instruct": 95.32, "SALMONN-7B": 48.48, "SALMONN-13B": 49.17
205
+ },
206
+ finaudiosum: {
207
+ "GPT-4o": 6.30, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 5.30, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 4.80, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
208
+ },
209
+ audio_average: {
210
+ "GPT-4o": 51.04, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 48.22, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 51.58, "Qwen2-Audio-7B": 48.02, "Qwen2-Audio-7B-Instruct": 50.06, "SALMONN-7B": 24.24, "SALMONN-13B": 24.59
211
+ },
212
+ modality_balanced: {
213
+ "GPT-4o": 46.01, "o3-Mini": 4.86, "Deepseek-V3": 10.20, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 20.89, "meta-llama/Llama-3.1-70B-Instruct": 14.07, "google/gemma-3-4b-it": 12.74, "google/gemma-3-27b-it": 15.74, "Qwen/Qwen2.5-32B-Instruct": 12.69, "Qwen/Qwen2.5-Omni-7B": 33.53, "TheFinAI/finma-7b-full": 8.94, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 10.41, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 8.13, "TheFinAI/FinMA-ES-Bilingual": 9.65, "TheFinAI/plutus-8B-instruct": 11.84, "LLaVA-1.6 Vicuna-13B": 5.15, "Deepseek-VL-7B-Chat": 4.71, "Whisper-V3": 17.19, "Qwen2-Audio-7B": 16.01, "Qwen2-Audio-7B-Instruct": 16.69, "SALMONN-7B": 8.08, "SALMONN-13B": 8.20
214
+ },
215
  };
216
 
217
  // Calculate min/max averages
 
250
  // 直接使用硬编码数据创建模型列表
251
  const modelList = [];
252
 
253
+ // Use modality_balanced as the canonical model list
254
+ const modelNames = Object.keys(HARDCODED_SCORES.modality_balanced);
255
+
256
+ modelNames.forEach((modelName, index) => {
257
+ const hardcodedEvaluations = Object.fromEntries(
258
+ DATASET_KEYS.map(key => [key, getHardcodedScore(modelName, key)])
259
+ );
260
+
261
+ // Use modality-balanced average from Table 3 directly
262
+ const averageScore = getHardcodedScore(modelName, 'modality_balanced');
263
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  modelList.push({
265
  id: `model-${index}`,
266
  model: {
267
  name: modelName,
268
  average_score: averageScore,
269
+ type: MODEL_TYPES_MAP[modelName] || "chat",
270
+ openness: MODEL_OPENNESS_MAP[modelName] || "Class III - Open Model",
271
  },
272
  evaluations: hardcodedEvaluations,
273
  features: {
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js CHANGED
@@ -440,7 +440,7 @@ const BooleanValue = ({ value }) => {
440
  );
441
  };
442
 
443
- // Greek Financial LLM Leaderboard创建自定义标题组件
444
  const createGreekLeaderboardHeader = (header) => (
445
  <Box
446
  className="header-content"
@@ -452,8 +452,8 @@ const createGreekLeaderboardHeader = (header) => (
452
  }}
453
  >
454
  <HeaderLabel
455
- label="Greek Financial LLM Leaderboard"
456
- tooltip="Average performance on Greek financial tasks"
457
  className="header-label"
458
  isSorted={header?.column?.getIsSorted()}
459
  />
@@ -467,34 +467,7 @@ const createGreekLeaderboardHeader = (header) => (
467
  flexShrink: 0,
468
  }}
469
  >
470
- <InfoIcon tooltip="Average performance on Greek financial tasks" />
471
- <Link
472
- href="https://huggingface.co/spaces/TheFinAI/Open-Greek-Financial-LLM-Leaderboard#/"
473
- target="_blank"
474
- rel="noopener noreferrer"
475
- aria-label="View Greek Financial LLM Leaderboard"
476
- sx={{
477
- color: "info.main",
478
- display: "flex",
479
- alignItems: "center",
480
- ml: 0.5,
481
- textDecoration: "none",
482
- "&:hover": {
483
- textDecoration: "underline",
484
- "& svg": {
485
- opacity: 0.8,
486
- },
487
- },
488
- }}
489
- >
490
- <OpenInNewIcon
491
- sx={{
492
- fontSize: "1rem",
493
- opacity: 0.6,
494
- transition: "opacity 0.2s ease-in-out",
495
- }}
496
- />
497
- </Link>
498
  </Box>
499
  </Box>
500
  );
@@ -511,7 +484,7 @@ const createLeaderboardHeader = (label, tooltip, linkUrl) => (header) => (
511
  }}
512
  >
513
  <HeaderLabel
514
- label={`${label} Leaderboard`}
515
  tooltip={tooltip}
516
  className="header-label"
517
  isSorted={header?.column?.getIsSorted()}
@@ -873,6 +846,8 @@ export const createColumns = (
873
  accessorKey: "model.openness",
874
  header: createHeaderCell("Openness", "Model openness classification"),
875
  cell: ({ row }) => {
 
 
876
  return (
877
  <Box
878
  sx={{
@@ -881,8 +856,11 @@ export const createColumns = (
881
  alignItems: "center",
882
  }}
883
  >
884
- <Typography variant="body2">
885
- Class III-Open Model
 
 
 
886
  </Typography>
887
  </Box>
888
  );
@@ -1011,162 +989,63 @@ export const createColumns = (
1011
  );
1012
  };
1013
 
1014
- const evaluationColumns = [
1015
- {
1016
- accessorKey: "evaluations.greek_average",
1017
- header: createGreekLeaderboardHeader,
1018
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.greek_average"),
1019
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1020
- meta: {
1021
- headerStyle: {
1022
- backgroundColor: (theme) => alpha(theme.palette.info.light, 0.05),
1023
- },
1024
- cellStyle: (value) => ({
1025
- position: "relative",
1026
- overflow: "hidden",
1027
- padding: "8px 16px",
1028
- backgroundColor: (theme) => alpha(theme.palette.info.light, 0.05),
1029
- }),
1030
- },
1031
- },
1032
- {
1033
- accessorKey: "evaluations.vision_average",
1034
- header: createLeaderboardHeader("Vision", "Average performance on vision tasks", null),
1035
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.vision_average"),
1036
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1037
- meta: {
1038
- headerStyle: {
1039
- backgroundColor: (theme) => alpha(theme.palette.primary.light, 0.05),
1040
- },
1041
- cellStyle: (value) => ({
1042
- position: "relative",
1043
- overflow: "hidden",
1044
- padding: "8px 16px",
1045
- backgroundColor: (theme) => alpha(theme.palette.primary.light, 0.05),
1046
- }),
1047
- },
1048
- },
1049
- {
1050
- accessorKey: "evaluations.audio_average",
1051
- header: createLeaderboardHeader("Audio", "Average performance on audio tasks", null),
1052
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.audio_average"),
1053
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1054
- meta: {
1055
- headerStyle: {
1056
- backgroundColor: (theme) => alpha(theme.palette.secondary.light, 0.05),
1057
- },
1058
- cellStyle: (value) => ({
1059
- position: "relative",
1060
- overflow: "hidden",
1061
- padding: "8px 16px",
1062
- backgroundColor: (theme) => alpha(theme.palette.secondary.light, 0.05),
1063
- }),
1064
- },
1065
- },
1066
- {
1067
- accessorKey: "evaluations.english_average",
1068
- header: createLeaderboardHeader("English", "Average performance on English language tasks", null),
1069
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.english_average"),
1070
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1071
- meta: {
1072
- headerStyle: {
1073
- backgroundColor: (theme) => alpha(theme.palette.success.light, 0.05),
1074
- },
1075
- cellStyle: (value) => ({
1076
- position: "relative",
1077
- overflow: "hidden",
1078
- padding: "8px 16px",
1079
- backgroundColor: (theme) => alpha(theme.palette.success.light, 0.05),
1080
- }),
1081
- },
1082
- },
1083
- {
1084
- accessorKey: "evaluations.chinese_average",
1085
- header: createLeaderboardHeader("Chinese", "Average performance on Chinese language tasks", null),
1086
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.chinese_average"),
1087
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1088
- meta: {
1089
- headerStyle: {
1090
- backgroundColor: (theme) => alpha(theme.palette.warning.light, 0.05),
1091
- },
1092
- cellStyle: (value) => ({
1093
- position: "relative",
1094
- overflow: "hidden",
1095
- padding: "8px 16px",
1096
- backgroundColor: (theme) => alpha(theme.palette.warning.light, 0.05),
1097
- }),
1098
- },
1099
- },
1100
- {
1101
- accessorKey: "evaluations.japanese_average",
1102
- header: createLeaderboardHeader("Japanese", "Average performance on Japanese language tasks", null),
1103
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.japanese_average"),
1104
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1105
- meta: {
1106
- headerStyle: {
1107
- backgroundColor: (theme) => alpha(theme.palette.error.light, 0.05),
1108
- },
1109
- cellStyle: (value) => ({
1110
- position: "relative",
1111
- overflow: "hidden",
1112
- padding: "8px 16px",
1113
- backgroundColor: (theme) => alpha(theme.palette.error.light, 0.05),
1114
- }),
1115
- },
1116
- },
1117
- {
1118
- accessorKey: "evaluations.spanish_average",
1119
- header: createLeaderboardHeader("Spanish", "Average performance on Spanish language tasks", null),
1120
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.spanish_average"),
1121
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1122
- meta: {
1123
- headerStyle: {
1124
- backgroundColor: (theme) => alpha(theme.palette.info.main, 0.05),
1125
- },
1126
- cellStyle: (value) => ({
1127
- position: "relative",
1128
- overflow: "hidden",
1129
- padding: "8px 16px",
1130
- backgroundColor: (theme) => alpha(theme.palette.info.main, 0.05),
1131
- }),
1132
- },
1133
- },
1134
- {
1135
- accessorKey: "evaluations.bilingual_average",
1136
- header: createLeaderboardHeader("Bilingual", "Average performance on bilingual tasks", null),
1137
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.bilingual_average"),
1138
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1139
- meta: {
1140
- headerStyle: {
1141
- backgroundColor: (theme) => alpha(theme.palette.primary.main, 0.05),
1142
- },
1143
- cellStyle: (value) => ({
1144
- position: "relative",
1145
- overflow: "hidden",
1146
- padding: "8px 16px",
1147
- backgroundColor: (theme) => alpha(theme.palette.primary.main, 0.05),
1148
- }),
1149
- },
1150
- },
1151
- {
1152
- accessorKey: "evaluations.multilingual_average",
1153
- header: createLeaderboardHeader("Multilingual", "Average performance on multilingual tasks", null),
1154
- cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.multilingual_average"),
1155
- size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1156
- meta: {
1157
- headerStyle: {
1158
- backgroundColor: (theme) => alpha(theme.palette.secondary.main, 0.05),
1159
- },
1160
- cellStyle: (value) => ({
1161
- position: "relative",
1162
- overflow: "hidden",
1163
- padding: "8px 16px",
1164
- backgroundColor: (theme) => alpha(theme.palette.secondary.main, 0.05),
1165
- }),
1166
- },
1167
- }
1168
  ];
1169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1170
  const optionalColumns = [
1171
  {
1172
  accessorKey: "model.architecture",
 
440
  );
441
  };
442
 
443
+ // Greek (EL) language text tasks header
444
  const createGreekLeaderboardHeader = (header) => (
445
  <Box
446
  className="header-content"
 
452
  }}
453
  >
454
  <HeaderLabel
455
+ label="Greek (EL)"
456
+ tooltip="Average performance on Greek language financial tasks (GRFinNUM, GRFinQA, GRFNS-2023, GRMultiFin)"
457
  className="header-label"
458
  isSorted={header?.column?.getIsSorted()}
459
  />
 
467
  flexShrink: 0,
468
  }}
469
  >
470
+ <InfoIcon tooltip="Average performance on Greek language financial tasks (GRFinNUM, GRFinQA, GRFNS-2023, GRMultiFin)" />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
  </Box>
472
  </Box>
473
  );
 
484
  }}
485
  >
486
  <HeaderLabel
487
+ label={label}
488
  tooltip={tooltip}
489
  className="header-label"
490
  isSorted={header?.column?.getIsSorted()}
 
846
  accessorKey: "model.openness",
847
  header: createHeaderCell("Openness", "Model openness classification"),
848
  cell: ({ row }) => {
849
+ const openness = row.original.model.openness || "Class III - Open Model";
850
+ const isClosed = openness === "Closed";
851
  return (
852
  <Box
853
  sx={{
 
856
  alignItems: "center",
857
  }}
858
  >
859
+ <Typography
860
+ variant="body2"
861
+ sx={isClosed ? { color: "text.secondary", fontStyle: "italic" } : {}}
862
+ >
863
+ {openness}
864
  </Typography>
865
  </Box>
866
  );
 
989
  );
990
  };
991
 
992
+ const EVAL_COLUMN_DEFS = [
993
+ { key: 'sc', label: 'SC', tooltip: 'EN Text: SC (Information Extraction)' },
994
+ { key: 'finred', label: 'FinRED', tooltip: 'EN Text: FinRED (Information Extraction)' },
995
+ { key: 'finer_ord', label: 'FINER-ORD', tooltip: 'EN Text: FINER-ORD (Information Extraction)' },
996
+ { key: 'headlines', label: 'Headlines', tooltip: 'EN Text: Headlines (Text Analysis)' },
997
+ { key: 'tsa_en', label: 'TSA', tooltip: 'EN Text: TSA (Text Analysis)' },
998
+ { key: 'xbrl_math', label: 'XBRL-Math', tooltip: 'EN Text: XBRL-Math (Question Answering)' },
999
+ { key: 'finqa', label: 'FinQA', tooltip: 'EN Text: FinQA (Question Answering)' },
1000
+ { key: 'tatqa', label: 'TATQA', tooltip: 'EN Text: TATQA (Question Answering)' },
1001
+ { key: 'ectsum', label: 'ECTSUM', tooltip: 'EN Text: ECTSUM (Text Generation)' },
1002
+ { key: 'edtsum', label: 'EDTSUM', tooltip: 'EN Text: EDTSUM (Text Generation)' },
1003
+ { key: 'ccf', label: 'CCF', tooltip: 'EN Text: CCF (Ranking Modeling)' },
1004
+ { key: 'bigdata22', label: 'BigData22', tooltip: 'EN Text: BigData22 (Portfolio Optimization)' },
1005
+ { key: 'msft', label: 'MSFT', tooltip: 'EN Text: MSFT (Decision Making)' },
1006
+ { key: 'rre', label: 'RRE', tooltip: 'ZH Text: RRE (Information Extraction)' },
1007
+ { key: 'aie', label: 'AIE', tooltip: 'ZH Text: AIE (Text Analysis)' },
1008
+ { key: 'lne', label: 'LNE', tooltip: 'ZH Text: LNE (Text Analysis)' },
1009
+ { key: 'financialiq', label: 'FinancialIQ', tooltip: 'ZH Text: FinancialIQ (Question Answering)' },
1010
+ { key: 'chabsa', label: 'chabsa', tooltip: 'JA Text: chabsa (Text Analysis)' },
1011
+ { key: 'multifin', label: 'MultiFin', tooltip: 'ES Text: MultiFin (Text Analysis)' },
1012
+ { key: 'tsa_es', label: 'TSA (ES)', tooltip: 'ES Text: TSA (Text Analysis)' },
1013
+ { key: 'efpa', label: 'EFPA', tooltip: 'ES Text: EFPA (Question Answering)' },
1014
+ { key: 'fns_2023', label: 'FNS-2023', tooltip: 'ES Text: FNS-2023 (Text Generation)' },
1015
+ { key: 'grfinnum', label: 'GRFinNUM', tooltip: 'EL Text: GRFinNUM (Information Extraction)' },
1016
+ { key: 'grfinqa', label: 'GRFinQA', tooltip: 'EL Text: GRFinQA (Question Answering)' },
1017
+ { key: 'grfns_2023', label: 'GRFNS-2023', tooltip: 'EL Text: GRFNS-2023 (Text Generation)' },
1018
+ { key: 'grmultifin', label: 'GRMultiFin', tooltip: 'EL Text: GRMultiFin (Text Analysis)' },
1019
+ { key: 'dolfin', label: 'DOLFIN', tooltip: 'BI Text: DOLFIN (Text Generation)' },
1020
+ { key: 'polyfiqa_easy', label: 'PolyFiQA-Easy', tooltip: 'MU Text: PolyFiQA-Easy (Question Answering)' },
1021
+ { key: 'polyfiqa_expert', label: 'PolyFiQA-Expert', tooltip: 'MU Text: PolyFiQA-Expert (Question Answering)' },
1022
+ { key: 'text_average', label: 'Text Avg', tooltip: 'Average performance on text tasks' },
1023
+ { key: 'englishocr', label: 'EnglishOCR', tooltip: 'EN Vision: EnglishOCR (Information Extraction)' },
1024
+ { key: 'tablebench', label: 'TableBench', tooltip: 'EN Vision: TableBench (Question Answering)' },
1025
+ { key: 'japaneseocr', label: 'JapaneseOCR', tooltip: 'JA Vision: JapaneseOCR (Information Extraction)' },
1026
+ { key: 'spanishocr', label: 'SpanishOCR', tooltip: 'ES Vision: SpanishOCR (Information Extraction)' },
1027
+ { key: 'greekocr', label: 'GreekOCR', tooltip: 'EL Vision: GreekOCR (Information Extraction)' },
1028
+ { key: 'vision_average', label: 'Vision Avg', tooltip: 'Average performance on vision tasks' },
1029
+ { key: 'mdrm_test', label: 'MDRM-test', tooltip: 'EN Audio: MDRM-test (Text Generation)' },
1030
+ { key: 'finaudiosum', label: 'FinAudioSum', tooltip: 'EN Audio: FinAudioSum (Text Generation)' },
1031
+ { key: 'audio_average', label: 'Audio Avg', tooltip: 'Average performance on audio tasks' },
1032
+ { key: 'modality_balanced', label: 'Modal. Avg', tooltip: 'Modality-Balanced Average (mean of text, vision, audio averages)' },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1033
  ];
1034
 
1035
+ const evaluationColumns = EVAL_COLUMN_DEFS.map(({ key, label, tooltip }) => ({
1036
+ accessorKey: `evaluations.${key}`,
1037
+ header: createLeaderboardHeader(label, tooltip, null),
1038
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, `evaluations.${key}`),
1039
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1040
+ meta: {
1041
+ cellStyle: () => ({
1042
+ position: "relative",
1043
+ overflow: "hidden",
1044
+ padding: "8px 16px",
1045
+ }),
1046
+ },
1047
+ }));
1048
+
1049
  const optionalColumns = [
1050
  {
1051
  accessorKey: "model.architecture",