Replace leaderboard data with MultiFinBen Table 3 scores
Browse files
- Restructure evaluation columns from language averages to individual datasets (40 columns)
- Add all 29 text datasets: EN/ZH/JA/ES/EL/BI/MU tasks + averages
- Add vision datasets: EnglishOCR, TableBench, JapaneseOCR, SpanishOCR, GreekOCR + average
- Add audio datasets: MDRM-test, FinAudioSum + average + modality-balanced average
- Fill all scores from MultiFinBen Table 3 for 21 models
- frontend/src/pages/LeaderboardPage/LeaderboardPage.js +3 -4
- frontend/src/pages/LeaderboardPage/components/Leaderboard/Leaderboard.js +3 -3
- frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/allowedModels.js +0 -1
- frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js +40 -6
- frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useDataUtils.js +218 -112
- frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js +67 -188
frontend/src/pages/LeaderboardPage/LeaderboardPage.js
CHANGED
|
@@ -11,11 +11,10 @@ function LeaderboardPage() {
|
|
| 11 |
const { actions } = useLeaderboard();
|
| 12 |
|
| 13 |
useEffect(() => {
|
| 14 |
-
|
| 15 |
-
|
|
|
|
| 16 |
}
|
| 17 |
-
actions.setLoading(isLoading);
|
| 18 |
-
actions.setError(error);
|
| 19 |
}, [data, isLoading, error, actions]);
|
| 20 |
|
| 21 |
return (
|
|
|
|
| 11 |
const { actions } = useLeaderboard();
|
| 12 |
|
| 13 |
useEffect(() => {
|
| 14 |
+
actions.setModels(data || []);
|
| 15 |
+
if (!error) {
|
| 16 |
+
actions.setLoading(isLoading);
|
| 17 |
}
|
|
|
|
|
|
|
| 18 |
}, [data, isLoading, error, actions]);
|
| 19 |
|
| 20 |
return (
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/Leaderboard.js
CHANGED
|
@@ -215,7 +215,7 @@ const Leaderboard = () => {
|
|
| 215 |
|
| 216 |
// Memoize loading states
|
| 217 |
const loadingStates = useMemo(() => {
|
| 218 |
-
const isInitialLoading = dataLoading
|
| 219 |
const isProcessingData = !memoizedTable || !memoizedFilteredData;
|
| 220 |
const isApplyingFilters = state.models.length > 0 && !memoizedFilteredData;
|
| 221 |
const hasValidFilterCounts =
|
|
@@ -367,11 +367,11 @@ const Leaderboard = () => {
|
|
| 367 |
]);
|
| 368 |
|
| 369 |
// If an error occurred, display it
|
| 370 |
-
if (
|
| 371 |
return (
|
| 372 |
<Box sx={{ p: 3, textAlign: "center" }}>
|
| 373 |
<Typography color="error">
|
| 374 |
-
{
|
| 375 |
"An error occurred while loading the data"}
|
| 376 |
</Typography>
|
| 377 |
</Box>
|
|
|
|
| 215 |
|
| 216 |
// Memoize loading states
|
| 217 |
const loadingStates = useMemo(() => {
|
| 218 |
+
const isInitialLoading = dataLoading && !memoizedFilteredData?.length;
|
| 219 |
const isProcessingData = !memoizedTable || !memoizedFilteredData;
|
| 220 |
const isApplyingFilters = state.models.length > 0 && !memoizedFilteredData;
|
| 221 |
const hasValidFilterCounts =
|
|
|
|
| 367 |
]);
|
| 368 |
|
| 369 |
// If an error occurred, display it
|
| 370 |
+
if (processingError) {
|
| 371 |
return (
|
| 372 |
<Box sx={{ p: 3, textAlign: "center" }}>
|
| 373 |
<Typography color="error">
|
| 374 |
+
{processingError?.message ||
|
| 375 |
"An error occurred while loading the data"}
|
| 376 |
</Typography>
|
| 377 |
</Box>
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/allowedModels.js
CHANGED
|
@@ -14,7 +14,6 @@ export const ALLOWED_MODELS = [
|
|
| 14 |
"cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese",
|
| 15 |
"TheFinAI/FinMA-ES-Bilingual",
|
| 16 |
"TheFinAI/plutus-8B-instruct",
|
| 17 |
-
"Qwen-VL-MAX",
|
| 18 |
"LLaVA-1.6 Vicuna-13B",
|
| 19 |
"Deepseek-VL-7B-Chat",
|
| 20 |
"Whisper-V3",
|
|
|
|
| 14 |
"cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese",
|
| 15 |
"TheFinAI/FinMA-ES-Bilingual",
|
| 16 |
"TheFinAI/plutus-8B-instruct",
|
|
|
|
| 17 |
"LLaVA-1.6 Vicuna-13B",
|
| 18 |
"Deepseek-VL-7B-Chat",
|
| 19 |
"Whisper-V3",
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js
CHANGED
|
@@ -139,12 +139,46 @@ const COLUMNS = {
|
|
| 139 |
},
|
| 140 |
},
|
| 141 |
EVALUATION: {
|
| 142 |
-
"evaluations.
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
},
|
| 149 |
MODEL_INFO: {
|
| 150 |
"metadata.co2_cost": {
|
|
|
|
| 139 |
},
|
| 140 |
},
|
| 141 |
EVALUATION: {
|
| 142 |
+
"evaluations.sc": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "SC" },
|
| 143 |
+
"evaluations.finred": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FinRED" },
|
| 144 |
+
"evaluations.finer_ord": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FINER-ORD" },
|
| 145 |
+
"evaluations.headlines": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Headlines" },
|
| 146 |
+
"evaluations.tsa_en": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "TSA" },
|
| 147 |
+
"evaluations.xbrl_math": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "XBRL-Math" },
|
| 148 |
+
"evaluations.finqa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FinQA" },
|
| 149 |
+
"evaluations.tatqa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "TATQA" },
|
| 150 |
+
"evaluations.ectsum": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "ECTSUM" },
|
| 151 |
+
"evaluations.edtsum": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "EDTSUM" },
|
| 152 |
+
"evaluations.ccf": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "CCF" },
|
| 153 |
+
"evaluations.bigdata22": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "BigData22" },
|
| 154 |
+
"evaluations.msft": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "MSFT" },
|
| 155 |
+
"evaluations.rre": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "RRE" },
|
| 156 |
+
"evaluations.aie": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "AIE" },
|
| 157 |
+
"evaluations.lne": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "LNE" },
|
| 158 |
+
"evaluations.financialiq": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FinancialIQ" },
|
| 159 |
+
"evaluations.chabsa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "chabsa" },
|
| 160 |
+
"evaluations.multifin": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "MultiFin" },
|
| 161 |
+
"evaluations.tsa_es": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "TSA (ES)" },
|
| 162 |
+
"evaluations.efpa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "EFPA" },
|
| 163 |
+
"evaluations.fns_2023": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FNS-2023" },
|
| 164 |
+
"evaluations.grfinnum": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GRFinNUM" },
|
| 165 |
+
"evaluations.grfinqa": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GRFinQA" },
|
| 166 |
+
"evaluations.grfns_2023": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GRFNS-2023" },
|
| 167 |
+
"evaluations.grmultifin": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GRMultiFin" },
|
| 168 |
+
"evaluations.dolfin": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "DOLFIN" },
|
| 169 |
+
"evaluations.polyfiqa_easy": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "PolyFiQA-Easy" },
|
| 170 |
+
"evaluations.polyfiqa_expert": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "PolyFiQA-Expert" },
|
| 171 |
+
"evaluations.text_average": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Text Avg" },
|
| 172 |
+
"evaluations.englishocr": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "EnglishOCR" },
|
| 173 |
+
"evaluations.tablebench": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "TableBench" },
|
| 174 |
+
"evaluations.japaneseocr": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "JapaneseOCR" },
|
| 175 |
+
"evaluations.spanishocr": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "SpanishOCR" },
|
| 176 |
+
"evaluations.greekocr": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "GreekOCR" },
|
| 177 |
+
"evaluations.vision_average": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Vision Avg" },
|
| 178 |
+
"evaluations.mdrm_test": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "MDRM-test" },
|
| 179 |
+
"evaluations.finaudiosum": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "FinAudioSum" },
|
| 180 |
+
"evaluations.audio_average": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Audio Avg" },
|
| 181 |
+
"evaluations.modality_balanced": { group: "evaluation", size: COLUMN_SIZES.BENCHMARK, defaultVisible: true, label: "Modal. Avg" },
|
| 182 |
},
|
| 183 |
MODEL_INFO: {
|
| 184 |
"metadata.co2_cost": {
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useDataUtils.js
CHANGED
|
@@ -6,89 +6,212 @@ import {
|
|
| 6 |
} from "../utils/searchUtils";
|
| 7 |
import { ALLOWED_MODELS, isModelAllowed } from "../constants/allowedModels";
|
| 8 |
|
| 9 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
const HARDCODED_SCORES = {
|
| 11 |
-
|
| 12 |
-
"GPT-4o":
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00,
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
"Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
|
| 19 |
-
},
|
| 20 |
-
|
| 21 |
-
"GPT-4o":
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview":
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
"Qwen2-Audio-7B-Instruct":
|
| 28 |
-
},
|
| 29 |
-
|
| 30 |
-
"GPT-4o":
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview":
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
"Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
|
| 37 |
-
},
|
| 38 |
-
|
| 39 |
-
"GPT-4o":
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview":
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
"Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
|
| 46 |
-
},
|
| 47 |
-
|
| 48 |
-
"GPT-4o":
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview":
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
"Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
|
| 55 |
-
},
|
| 56 |
-
|
| 57 |
-
"GPT-4o":
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview":
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
"Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
|
| 64 |
-
},
|
| 65 |
-
|
| 66 |
-
"GPT-4o":
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview":
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
"Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
|
| 73 |
-
},
|
| 74 |
-
|
| 75 |
-
"GPT-4o":
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview":
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
"Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
|
| 82 |
-
},
|
| 83 |
-
|
| 84 |
-
"GPT-4o":
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview":
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
"Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
|
| 91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
};
|
| 93 |
|
| 94 |
// Calculate min/max averages
|
|
@@ -127,41 +250,24 @@ export const useProcessedData = (data, averageMode, visibleColumns) => {
|
|
| 127 |
// 直接使用硬编码数据创建模型列表
|
| 128 |
const modelList = [];
|
| 129 |
|
| 130 |
-
//
|
| 131 |
-
const modelNames =
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
// 创建硬编码评估数据
|
| 142 |
-
const hardcodedEvaluations = {
|
| 143 |
-
vision_average: getHardcodedScore(modelName, 'vision'),
|
| 144 |
-
audio_average: getHardcodedScore(modelName, 'audio'),
|
| 145 |
-
english_average: getHardcodedScore(modelName, 'english'),
|
| 146 |
-
chinese_average: getHardcodedScore(modelName, 'chinese'),
|
| 147 |
-
japanese_average: getHardcodedScore(modelName, 'japanese'),
|
| 148 |
-
spanish_average: getHardcodedScore(modelName, 'spanish'),
|
| 149 |
-
greek_average: getHardcodedScore(modelName, 'greek'),
|
| 150 |
-
bilingual_average: getHardcodedScore(modelName, 'bilingual'),
|
| 151 |
-
multilingual_average: getHardcodedScore(modelName, 'multilingual')
|
| 152 |
-
};
|
| 153 |
-
|
| 154 |
-
// 计算总平均分(包含分数为0的类别)
|
| 155 |
-
const scores = Object.values(hardcodedEvaluations).filter(score => score !== null);
|
| 156 |
-
const averageScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : null;
|
| 157 |
-
|
| 158 |
-
// 创建模型数据
|
| 159 |
modelList.push({
|
| 160 |
id: `model-${index}`,
|
| 161 |
model: {
|
| 162 |
name: modelName,
|
| 163 |
average_score: averageScore,
|
| 164 |
-
type: "chat",
|
|
|
|
| 165 |
},
|
| 166 |
evaluations: hardcodedEvaluations,
|
| 167 |
features: {
|
|
|
|
| 6 |
} from "../utils/searchUtils";
|
| 7 |
import { ALLOWED_MODELS, isModelAllowed } from "../constants/allowedModels";
|
| 8 |
|
| 9 |
+
// MultiFinBen Table 3 hardcoded metadata: maps each model name to its model type ("chat", "fine-tuned", or "multimodal")
|
| 10 |
+
const MODEL_TYPES_MAP = {
|
| 11 |
+
"GPT-4o": "chat", "o3-Mini": "chat", "Deepseek-V3": "chat",
|
| 12 |
+
"meta-llama/Llama-4-Scout-17B-16E-Instruct": "chat",
|
| 13 |
+
"meta-llama/Llama-3.1-70B-Instruct": "chat",
|
| 14 |
+
"google/gemma-3-4b-it": "chat", "google/gemma-3-27b-it": "chat",
|
| 15 |
+
"Qwen/Qwen2.5-32B-Instruct": "chat", "Qwen/Qwen2.5-Omni-7B": "multimodal",
|
| 16 |
+
"TheFinAI/finma-7b-full": "fine-tuned",
|
| 17 |
+
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": "chat",
|
| 18 |
+
"cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": "fine-tuned",
|
| 19 |
+
"TheFinAI/FinMA-ES-Bilingual": "fine-tuned", "TheFinAI/plutus-8B-instruct": "fine-tuned",
|
| 20 |
+
"LLaVA-1.6 Vicuna-13B": "multimodal", "Deepseek-VL-7B-Chat": "multimodal",
|
| 21 |
+
"Whisper-V3": "multimodal", "Qwen2-Audio-7B": "multimodal",
|
| 22 |
+
"Qwen2-Audio-7B-Instruct": "multimodal", "SALMONN-7B": "multimodal", "SALMONN-13B": "multimodal"
|
| 23 |
+
};
|
| 24 |
+
|
| 25 |
+
// Openness label for each model name, following the Model Openness Framework (isitopen.ai)
|
| 26 |
+
// GPT-4o and o3-Mini are labelled "Closed": proprietary models (API only, no weights released)
|
| 27 |
+
// Every other model is labelled "Class III - Open Model" (publicly released weights)
|
| 28 |
+
const MODEL_OPENNESS_MAP = {
|
| 29 |
+
"GPT-4o": "Closed", "o3-Mini": "Closed",
|
| 30 |
+
"Deepseek-V3": "Class III - Open Model",
|
| 31 |
+
"meta-llama/Llama-4-Scout-17B-16E-Instruct": "Class III - Open Model",
|
| 32 |
+
"meta-llama/Llama-3.1-70B-Instruct": "Class III - Open Model",
|
| 33 |
+
"google/gemma-3-4b-it": "Class III - Open Model",
|
| 34 |
+
"google/gemma-3-27b-it": "Class III - Open Model",
|
| 35 |
+
"Qwen/Qwen2.5-32B-Instruct": "Class III - Open Model",
|
| 36 |
+
"Qwen/Qwen2.5-Omni-7B": "Class III - Open Model",
|
| 37 |
+
"TheFinAI/finma-7b-full": "Class III - Open Model",
|
| 38 |
+
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": "Class III - Open Model",
|
| 39 |
+
"cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": "Class III - Open Model",
|
| 40 |
+
"TheFinAI/FinMA-ES-Bilingual": "Class III - Open Model",
|
| 41 |
+
"TheFinAI/plutus-8B-instruct": "Class III - Open Model",
|
| 42 |
+
"LLaVA-1.6 Vicuna-13B": "Class III - Open Model",
|
| 43 |
+
"Deepseek-VL-7B-Chat": "Class III - Open Model",
|
| 44 |
+
"Whisper-V3": "Class III - Open Model",
|
| 45 |
+
"Qwen2-Audio-7B": "Class III - Open Model",
|
| 46 |
+
"Qwen2-Audio-7B-Instruct": "Class III - Open Model",
|
| 47 |
+
"SALMONN-7B": "Class III - Open Model",
|
| 48 |
+
"SALMONN-13B": "Class III - Open Model",
|
| 49 |
+
};
|
| 50 |
+
|
| 51 |
+
// The 21 model names covered by the hardcoded data; the same strings are used as keys in MODEL_TYPES_MAP and MODEL_OPENNESS_MAP.
// NOTE(review): array order presumably determines row/id order when the model list is built — confirm against the consumer.
const MODEL_NAMES = [
|
| 52 |
+
"GPT-4o", "o3-Mini", "Deepseek-V3",
|
| 53 |
+
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
| 54 |
+
"meta-llama/Llama-3.1-70B-Instruct",
|
| 55 |
+
"google/gemma-3-4b-it", "google/gemma-3-27b-it",
|
| 56 |
+
"Qwen/Qwen2.5-32B-Instruct", "Qwen/Qwen2.5-Omni-7B",
|
| 57 |
+
"TheFinAI/finma-7b-full",
|
| 58 |
+
"Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview",
|
| 59 |
+
"cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese",
|
| 60 |
+
"TheFinAI/FinMA-ES-Bilingual", "TheFinAI/plutus-8B-instruct",
|
| 61 |
+
"LLaVA-1.6 Vicuna-13B", "Deepseek-VL-7B-Chat",
|
| 62 |
+
"Whisper-V3", "Qwen2-Audio-7B",
|
| 63 |
+
"Qwen2-Audio-7B-Instruct", "SALMONN-7B", "SALMONN-13B"
|
| 64 |
+
];
|
| 65 |
+
|
| 66 |
+
// Evaluation dataset keys (40 total): 29 text datasets (EN, ZH, JA, ES, EL, BI, MU) + text average; 5 vision datasets + vision average; 2 audio datasets + audio average; modality-balanced average
|
| 67 |
+
export const DATASET_KEYS = [
|
| 68 |
+
// Text - English
|
| 69 |
+
'sc', 'finred', 'finer_ord', 'headlines', 'tsa_en', 'xbrl_math',
|
| 70 |
+
'finqa', 'tatqa', 'ectsum', 'edtsum', 'ccf', 'bigdata22', 'msft',
|
| 71 |
+
// Text - Chinese
|
| 72 |
+
'rre', 'aie', 'lne', 'financialiq',
|
| 73 |
+
// Text - Japanese
|
| 74 |
+
'chabsa',
|
| 75 |
+
// Text - Spanish
|
| 76 |
+
'multifin', 'tsa_es', 'efpa', 'fns_2023',
|
| 77 |
+
// Text - Greek
|
| 78 |
+
'grfinnum', 'grfinqa', 'grfns_2023', 'grmultifin',
|
| 79 |
+
// Text - Bilingual
|
| 80 |
+
'dolfin',
|
| 81 |
+
// Text - Multilingual
|
| 82 |
+
'polyfiqa_easy', 'polyfiqa_expert',
|
| 83 |
+
// Text Average
|
| 84 |
+
'text_average',
|
| 85 |
+
// Vision
|
| 86 |
+
'englishocr', 'tablebench', 'japaneseocr', 'spanishocr', 'greekocr',
|
| 87 |
+
'vision_average',
|
| 88 |
+
// Audio
|
| 89 |
+
'mdrm_test', 'finaudiosum', 'audio_average',
|
| 90 |
+
// Modality-Balanced Average
|
| 91 |
+
'modality_balanced',
|
| 92 |
+
];
|
| 93 |
+
|
| 94 |
// Leaderboard scores, indexed as HARDCODED_SCORES[datasetKey][modelName].
// Per the surrounding comments the figures are transcribed from MultiFinBen
// Table 3. Every row lists the same 21 models in the same order.
// A value of 0.00 is used wherever a model has no score for a dataset
// (presumably "not evaluated on this task/modality" — confirm against the paper).
//
// Invariants that hold for the data below and are worth preserving on edits:
//   - vision_average[m] is the mean of the five vision rows (englishocr,
//     tablebench, japaneseocr, spanishocr, greekocr) for model m;
//   - audio_average[m] is the mean of mdrm_test and finaudiosum;
//   - modality_balanced[m] is the mean of text_average, vision_average and
//     audio_average.
const HARDCODED_SCORES = {
  // ---- Text - English ----
  sc: {
    "GPT-4o": 88.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 20.73, "meta-llama/Llama-3.1-70B-Instruct": 87.00, "google/gemma-3-4b-it": 0.69, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 22.28, "Qwen/Qwen2.5-Omni-7B": 18.61, "TheFinAI/finma-7b-full": 56.62, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 24.09, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 15.92, "TheFinAI/FinMA-ES-Bilingual": 52.70, "TheFinAI/plutus-8B-instruct": 19.79, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  finred: {
    "GPT-4o": 3.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.37, "Qwen/Qwen2.5-Omni-7B": 0.09, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.75, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  finer_ord: {
    "GPT-4o": 78.00, "o3-Mini": 9.58, "Deepseek-V3": 0.18, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 2.23, "meta-llama/Llama-3.1-70B-Instruct": 18.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 28.30, "Qwen/Qwen2.5-Omni-7B": 8.30, "TheFinAI/finma-7b-full": 0.04, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 5.35, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  headlines: {
    "GPT-4o": 78.00, "o3-Mini": 0.00, "Deepseek-V3": 47.32, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 84.33, "meta-llama/Llama-3.1-70B-Instruct": 60.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 85.42, "Qwen/Qwen2.5-Omni-7B": 82.23, "TheFinAI/finma-7b-full": 97.08, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 85.10, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 82.21, "TheFinAI/FinMA-ES-Bilingual": 94.69, "TheFinAI/plutus-8B-instruct": 71.14, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  tsa_en: {
    "GPT-4o": 61.00, "o3-Mini": 0.00, "Deepseek-V3": 0.85, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 63.82, "meta-llama/Llama-3.1-70B-Instruct": 42.00, "google/gemma-3-4b-it": 32.34, "google/gemma-3-27b-it": 32.34, "Qwen/Qwen2.5-32B-Instruct": 42.98, "Qwen/Qwen2.5-Omni-7B": 43.40, "TheFinAI/finma-7b-full": 81.70, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 85.11, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 60.00, "TheFinAI/FinMA-ES-Bilingual": 86.38, "TheFinAI/plutus-8B-instruct": 54.89, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  xbrl_math: {
    "GPT-4o": 68.00, "o3-Mini": 68.89, "Deepseek-V3": 67.78, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 27.78, "meta-llama/Llama-3.1-70B-Instruct": 62.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 11.11, "Qwen/Qwen2.5-32B-Instruct": 64.44, "Qwen/Qwen2.5-Omni-7B": 44.44, "TheFinAI/finma-7b-full": 7.78, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 3.33, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 2.22, "TheFinAI/FinMA-ES-Bilingual": 6.67, "TheFinAI/plutus-8B-instruct": 14.44, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  finqa: {
    "GPT-4o": 5.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 7.41, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 1.22, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  tatqa: {
    "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.36, "meta-llama/Llama-3.1-70B-Instruct": 44.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.05, "Qwen/Qwen2.5-Omni-7B": 1.73, "TheFinAI/finma-7b-full": 4.14, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 15.16, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  ectsum: {
    "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  edtsum: {
    "GPT-4o": 25.00, "o3-Mini": 19.13, "Deepseek-V3": 16.80, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 16.59, "meta-llama/Llama-3.1-70B-Instruct": 18.00, "google/gemma-3-4b-it": 0.98, "google/gemma-3-27b-it": 0.10, "Qwen/Qwen2.5-32B-Instruct": 20.16, "Qwen/Qwen2.5-Omni-7B": 23.89, "TheFinAI/finma-7b-full": 19.92, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 12.49, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 8.06, "TheFinAI/FinMA-ES-Bilingual": 2.06, "TheFinAI/plutus-8B-instruct": 13.61, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  ccf: {
    "GPT-4o": 52.50, "o3-Mini": 50.00, "Deepseek-V3": 50.62, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 51.34, "meta-llama/Llama-3.1-70B-Instruct": 50.00, "google/gemma-3-4b-it": 50.93, "google/gemma-3-27b-it": 50.00, "Qwen/Qwen2.5-32B-Instruct": 52.94, "Qwen/Qwen2.5-Omni-7B": 50.31, "TheFinAI/finma-7b-full": 50.05, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 50.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 50.00, "TheFinAI/FinMA-ES-Bilingual": 51.18, "TheFinAI/plutus-8B-instruct": 50.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  bigdata22: {
    "GPT-4o": 48.50, "o3-Mini": 50.00, "Deepseek-V3": 50.93, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 46.91, "meta-llama/Llama-3.1-70B-Instruct": 50.00, "google/gemma-3-4b-it": 50.75, "google/gemma-3-27b-it": 50.00, "Qwen/Qwen2.5-32B-Instruct": 49.89, "Qwen/Qwen2.5-Omni-7B": 51.82, "TheFinAI/finma-7b-full": 50.80, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 53.12, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 50.00, "TheFinAI/FinMA-ES-Bilingual": 52.12, "TheFinAI/plutus-8B-instruct": 50.26, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  msft: {
    "GPT-4o": 41.32, "o3-Mini": 65.06, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 72.25, "google/gemma-3-4b-it": 74.03, "google/gemma-3-27b-it": 79.97, "Qwen/Qwen2.5-32B-Instruct": 49.32, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 68.81, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 74.50, "TheFinAI/FinMA-ES-Bilingual": 66.53, "TheFinAI/plutus-8B-instruct": 65.10, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Text - Chinese ----
  rre: {
    "GPT-4o": 63.25, "o3-Mini": 0.00, "Deepseek-V3": 67.52, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 54.70, "meta-llama/Llama-3.1-70B-Instruct": 46.15, "google/gemma-3-4b-it": 36.75, "google/gemma-3-27b-it": 36.75, "Qwen/Qwen2.5-32B-Instruct": 8.55, "Qwen/Qwen2.5-Omni-7B": 7.69, "TheFinAI/finma-7b-full": 0.85, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 2.56, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.85, "TheFinAI/FinMA-ES-Bilingual": 0.85, "TheFinAI/plutus-8B-instruct": 2.56, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  aie: {
    "GPT-4o": 82.26, "o3-Mini": 0.00, "Deepseek-V3": 82.01, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 80.99, "meta-llama/Llama-3.1-70B-Instruct": 76.80, "google/gemma-3-4b-it": 33.82, "google/gemma-3-27b-it": 33.82, "Qwen/Qwen2.5-32B-Instruct": 83.03, "Qwen/Qwen2.5-Omni-7B": 80.17, "TheFinAI/finma-7b-full": 40.81, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 10.04, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 4.32, "TheFinAI/FinMA-ES-Bilingual": 21.55, "TheFinAI/plutus-8B-instruct": 54.48, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  lne: {
    "GPT-4o": 63.30, "o3-Mini": 0.00, "Deepseek-V3": 58.72, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 55.50, "meta-llama/Llama-3.1-70B-Instruct": 41.28, "google/gemma-3-4b-it": 9.17, "google/gemma-3-27b-it": 9.17, "Qwen/Qwen2.5-32B-Instruct": 57.80, "Qwen/Qwen2.5-Omni-7B": 59.17, "TheFinAI/finma-7b-full": 29.82, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 22.48, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 12.84, "TheFinAI/FinMA-ES-Bilingual": 32.11, "TheFinAI/plutus-8B-instruct": 26.61, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  financialiq: {
    "GPT-4o": 32.53, "o3-Mini": 0.00, "Deepseek-V3": 35.52, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 66.83, "meta-llama/Llama-3.1-70B-Instruct": 62.71, "google/gemma-3-4b-it": 25.19, "google/gemma-3-27b-it": 25.20, "Qwen/Qwen2.5-32B-Instruct": 77.09, "Qwen/Qwen2.5-Omni-7B": 65.32, "TheFinAI/finma-7b-full": 26.21, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 57.07, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 34.70, "TheFinAI/FinMA-ES-Bilingual": 31.48, "TheFinAI/plutus-8B-instruct": 40.52, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Text - Japanese ----
  chabsa: {
    "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 48.43, "meta-llama/Llama-3.1-70B-Instruct": 32.17, "google/gemma-3-4b-it": 8.98, "google/gemma-3-27b-it": 23.96, "Qwen/Qwen2.5-32B-Instruct": 4.54, "Qwen/Qwen2.5-Omni-7B": 44.35, "TheFinAI/finma-7b-full": 46.94, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 47.59, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 23.96, "TheFinAI/FinMA-ES-Bilingual": 57.36, "TheFinAI/plutus-8B-instruct": 34.62, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Text - Spanish ----
  multifin: {
    "GPT-4o": 61.74, "o3-Mini": 0.00, "Deepseek-V3": 53.91, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 62.17, "meta-llama/Llama-3.1-70B-Instruct": 48.26, "google/gemma-3-4b-it": 22.17, "google/gemma-3-27b-it": 22.17, "Qwen/Qwen2.5-32B-Instruct": 46.52, "Qwen/Qwen2.5-Omni-7B": 46.96, "TheFinAI/finma-7b-full": 43.04, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 31.74, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 12.61, "TheFinAI/FinMA-ES-Bilingual": 44.78, "TheFinAI/plutus-8B-instruct": 51.30, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  tsa_es: {
    "GPT-4o": 0.39, "o3-Mini": 0.00, "Deepseek-V3": 29.17, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 52.29, "meta-llama/Llama-3.1-70B-Instruct": 24.29, "google/gemma-3-4b-it": 63.04, "google/gemma-3-27b-it": 63.46, "Qwen/Qwen2.5-32B-Instruct": 31.63, "Qwen/Qwen2.5-Omni-7B": 46.46, "TheFinAI/finma-7b-full": 31.03, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 68.19, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 63.38, "TheFinAI/FinMA-ES-Bilingual": 16.64, "TheFinAI/plutus-8B-instruct": 51.82, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  efpa: {
    "GPT-4o": 31.14, "o3-Mini": 0.00, "Deepseek-V3": 18.86, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 67.54, "meta-llama/Llama-3.1-70B-Instruct": 66.67, "google/gemma-3-4b-it": 25.44, "google/gemma-3-27b-it": 25.44, "Qwen/Qwen2.5-32B-Instruct": 65.79, "Qwen/Qwen2.5-Omni-7B": 55.70, "TheFinAI/finma-7b-full": 32.46, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 65.79, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 25.44, "TheFinAI/FinMA-ES-Bilingual": 91.67, "TheFinAI/plutus-8B-instruct": 48.25, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  fns_2023: {
    "GPT-4o": 25.94, "o3-Mini": 18.11, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 9.61, "meta-llama/Llama-3.1-70B-Instruct": 12.14, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 5.93, "Qwen/Qwen2.5-Omni-7B": 7.50, "TheFinAI/finma-7b-full": 1.64, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 5.71, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 10.62, "TheFinAI/FinMA-ES-Bilingual": 1.65, "TheFinAI/plutus-8B-instruct": 9.27, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Text - Greek ----
  grfinnum: {
    "GPT-4o": 9.18, "o3-Mini": 20.98, "Deepseek-V3": 7.43, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 49.12, "meta-llama/Llama-3.1-70B-Instruct": 46.34, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 36.77, "Qwen/Qwen2.5-Omni-7B": 0.40, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 70.06, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  grfinqa: {
    "GPT-4o": 78.22, "o3-Mini": 0.00, "Deepseek-V3": 50.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 74.22, "meta-llama/Llama-3.1-70B-Instruct": 64.44, "google/gemma-3-4b-it": 22.67, "google/gemma-3-27b-it": 22.67, "Qwen/Qwen2.5-32B-Instruct": 60.44, "Qwen/Qwen2.5-Omni-7B": 48.89, "TheFinAI/finma-7b-full": 25.33, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 57.78, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 28.44, "TheFinAI/FinMA-ES-Bilingual": 23.11, "TheFinAI/plutus-8B-instruct": 64.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  grfns_2023: {
    "GPT-4o": 25.50, "o3-Mini": 16.95, "Deepseek-V3": 37.72, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 16.90, "meta-llama/Llama-3.1-70B-Instruct": 13.61, "google/gemma-3-4b-it": 0.24, "google/gemma-3-27b-it": 0.21, "Qwen/Qwen2.5-32B-Instruct": 9.71, "Qwen/Qwen2.5-Omni-7B": 5.60, "TheFinAI/finma-7b-full": 11.20, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 6.48, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 14.45, "TheFinAI/FinMA-ES-Bilingual": 3.56, "TheFinAI/plutus-8B-instruct": 34.46, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  grmultifin: {
    "GPT-4o": 59.26, "o3-Mini": 0.00, "Deepseek-V3": 61.11, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 55.56, "meta-llama/Llama-3.1-70B-Instruct": 50.00, "google/gemma-3-4b-it": 38.89, "google/gemma-3-27b-it": 38.89, "Qwen/Qwen2.5-32B-Instruct": 70.37, "Qwen/Qwen2.5-Omni-7B": 38.89, "TheFinAI/finma-7b-full": 35.19, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 53.70, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 40.74, "TheFinAI/FinMA-ES-Bilingual": 35.19, "TheFinAI/plutus-8B-instruct": 72.22, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Text - Bilingual ----
  dolfin: {
    "GPT-4o": 92.29, "o3-Mini": 90.13, "Deepseek-V3": 86.26, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 89.17, "meta-llama/Llama-3.1-70B-Instruct": 92.13, "google/gemma-3-4b-it": 35.92, "google/gemma-3-27b-it": 35.92, "Qwen/Qwen2.5-32B-Instruct": 92.29, "Qwen/Qwen2.5-Omni-7B": 91.80, "TheFinAI/finma-7b-full": 69.24, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 91.60, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 71.81, "TheFinAI/FinMA-ES-Bilingual": 66.57, "TheFinAI/plutus-8B-instruct": 91.59, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Text - Multilingual ----
  polyfiqa_easy: {
    "GPT-4o": 9.79, "o3-Mini": 9.56, "Deepseek-V3": 34.72, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 27.73, "meta-llama/Llama-3.1-70B-Instruct": 25.04, "google/gemma-3-4b-it": 15.02, "google/gemma-3-27b-it": 14.74, "Qwen/Qwen2.5-32B-Instruct": 19.34, "Qwen/Qwen2.5-Omni-7B": 18.81, "TheFinAI/finma-7b-full": 2.44, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 2.40, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 11.63, "TheFinAI/FinMA-ES-Bilingual": 0.63, "TheFinAI/plutus-8B-instruct": 7.06, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  polyfiqa_expert: {
    "GPT-4o": 5.31, "o3-Mini": 4.85, "Deepseek-V3": 30.35, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 20.60, "meta-llama/Llama-3.1-70B-Instruct": 18.56, "google/gemma-3-4b-it": 13.83, "google/gemma-3-27b-it": 16.01, "Qwen/Qwen2.5-32B-Instruct": 18.17, "Qwen/Qwen2.5-Omni-7B": 16.35, "TheFinAI/finma-7b-full": 6.38, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.71, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 8.80, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 9.87, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Text average ----
  text_average: {
    "GPT-4o": 40.98, "o3-Mini": 14.59, "Deepseek-V3": 30.61, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 39.50, "meta-llama/Llama-3.1-70B-Instruct": 42.20, "google/gemma-3-4b-it": 19.34, "google/gemma-3-27b-it": 20.41, "Qwen/Qwen2.5-32B-Instruct": 38.07, "Qwen/Qwen2.5-Omni-7B": 33.06, "TheFinAI/finma-7b-full": 26.83, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 31.24, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 24.40, "TheFinAI/FinMA-ES-Bilingual": 28.95, "TheFinAI/plutus-8B-instruct": 35.53, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Vision ----
  englishocr: {
    "GPT-4o": 21.38, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 12.39, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 10.70, "google/gemma-3-27b-it": 11.40, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  tablebench: {
    "GPT-4o": 66.70, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 32.30, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 28.60, "google/gemma-3-27b-it": 60.90, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 74.90, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 59.30, "Deepseek-VL-7B-Chat": 57.30, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // The 9.70 / 6.62 scores belong to the two vision models (LLaVA, Deepseek-VL),
  // not to the text-only FinMA-ES / plutus models: only this assignment
  // reproduces the vision_average row (15.45, 14.14 and 0.00 / 0.00).
  japaneseocr: {
    "GPT-4o": 21.63, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 24.52, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 25.82, "google/gemma-3-27b-it": 26.72, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 21.59, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 9.70, "Deepseek-VL-7B-Chat": 6.62, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  spanishocr: {
    "GPT-4o": 78.55, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 4.12, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 5.60, "google/gemma-3-27b-it": 4.40, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // As with japaneseocr: 8.25 / 6.80 are the LLaVA / Deepseek-VL scores.
  greekocr: {
    "GPT-4o": 41.86, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 42.51, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 23.69, "google/gemma-3-27b-it": 30.60, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 8.25, "Deepseek-VL-7B-Chat": 6.80, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  vision_average: {
    "GPT-4o": 46.02, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 23.17, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 18.88, "google/gemma-3-27b-it": 26.80, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 19.30, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 15.45, "Deepseek-VL-7B-Chat": 14.14, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  // ---- Audio ----
  mdrm_test: {
    "GPT-4o": 95.77, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 96.43, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 97.86, "Qwen2-Audio-7B": 96.03, "Qwen2-Audio-7B-Instruct": 95.32, "SALMONN-7B": 48.48, "SALMONN-13B": 49.17
  },
  finaudiosum: {
    "GPT-4o": 6.30, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 0.00, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 5.30, "Qwen2-Audio-7B": 0.00, "Qwen2-Audio-7B-Instruct": 4.80, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  audio_average: {
    "GPT-4o": 51.04, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00, "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00, "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 48.22, "TheFinAI/finma-7b-full": 0.00, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00, "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 51.58, "Qwen2-Audio-7B": 48.02, "Qwen2-Audio-7B-Instruct": 50.06, "SALMONN-7B": 24.24, "SALMONN-13B": 24.59
  },
  // ---- Modality-balanced average ----
  modality_balanced: {
    "GPT-4o": 46.01, "o3-Mini": 4.86, "Deepseek-V3": 10.20, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 20.89, "meta-llama/Llama-3.1-70B-Instruct": 14.07, "google/gemma-3-4b-it": 12.74, "google/gemma-3-27b-it": 15.74, "Qwen/Qwen2.5-32B-Instruct": 12.69, "Qwen/Qwen2.5-Omni-7B": 33.53, "TheFinAI/finma-7b-full": 8.94, "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 10.41, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 8.13, "TheFinAI/FinMA-ES-Bilingual": 9.65, "TheFinAI/plutus-8B-instruct": 11.84, "LLaVA-1.6 Vicuna-13B": 5.15, "Deepseek-VL-7B-Chat": 4.71, "Whisper-V3": 17.19, "Qwen2-Audio-7B": 16.01, "Qwen2-Audio-7B-Instruct": 16.69, "SALMONN-7B": 8.08, "SALMONN-13B": 8.20
  },
};
|
| 216 |
|
| 217 |
// Calculate min/max averages
|
|
|
|
| 250 |
// Build the model list directly from the hard-coded score data
|
| 251 |
const modelList = [];
|
| 252 |
|
| 253 |
+
// Use modality_balanced as the canonical model list
|
| 254 |
+
const modelNames = Object.keys(HARDCODED_SCORES.modality_balanced);
|
| 255 |
+
|
| 256 |
+
modelNames.forEach((modelName, index) => {
|
| 257 |
+
const hardcodedEvaluations = Object.fromEntries(
|
| 258 |
+
DATASET_KEYS.map(key => [key, getHardcodedScore(modelName, key)])
|
| 259 |
+
);
|
| 260 |
+
|
| 261 |
+
// Use modality-balanced average from Table 3 directly
|
| 262 |
+
const averageScore = getHardcodedScore(modelName, 'modality_balanced');
|
| 263 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
modelList.push({
|
| 265 |
id: `model-${index}`,
|
| 266 |
model: {
|
| 267 |
name: modelName,
|
| 268 |
average_score: averageScore,
|
| 269 |
+
type: MODEL_TYPES_MAP[modelName] || "chat",
|
| 270 |
+
openness: MODEL_OPENNESS_MAP[modelName] || "Class III - Open Model",
|
| 271 |
},
|
| 272 |
evaluations: hardcodedEvaluations,
|
| 273 |
features: {
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js
CHANGED
|
@@ -440,7 +440,7 @@ const BooleanValue = ({ value }) => {
|
|
| 440 |
);
|
| 441 |
};
|
| 442 |
|
| 443 |
-
//
|
| 444 |
const createGreekLeaderboardHeader = (header) => (
|
| 445 |
<Box
|
| 446 |
className="header-content"
|
|
@@ -452,8 +452,8 @@ const createGreekLeaderboardHeader = (header) => (
|
|
| 452 |
}}
|
| 453 |
>
|
| 454 |
<HeaderLabel
|
| 455 |
-
label="Greek
|
| 456 |
-
tooltip="Average performance on Greek financial tasks"
|
| 457 |
className="header-label"
|
| 458 |
isSorted={header?.column?.getIsSorted()}
|
| 459 |
/>
|
|
@@ -467,34 +467,7 @@ const createGreekLeaderboardHeader = (header) => (
|
|
| 467 |
flexShrink: 0,
|
| 468 |
}}
|
| 469 |
>
|
| 470 |
-
<InfoIcon tooltip="Average performance on Greek financial tasks" />
|
| 471 |
-
<Link
|
| 472 |
-
href="https://huggingface.co/spaces/TheFinAI/Open-Greek-Financial-LLM-Leaderboard#/"
|
| 473 |
-
target="_blank"
|
| 474 |
-
rel="noopener noreferrer"
|
| 475 |
-
aria-label="View Greek Financial LLM Leaderboard"
|
| 476 |
-
sx={{
|
| 477 |
-
color: "info.main",
|
| 478 |
-
display: "flex",
|
| 479 |
-
alignItems: "center",
|
| 480 |
-
ml: 0.5,
|
| 481 |
-
textDecoration: "none",
|
| 482 |
-
"&:hover": {
|
| 483 |
-
textDecoration: "underline",
|
| 484 |
-
"& svg": {
|
| 485 |
-
opacity: 0.8,
|
| 486 |
-
},
|
| 487 |
-
},
|
| 488 |
-
}}
|
| 489 |
-
>
|
| 490 |
-
<OpenInNewIcon
|
| 491 |
-
sx={{
|
| 492 |
-
fontSize: "1rem",
|
| 493 |
-
opacity: 0.6,
|
| 494 |
-
transition: "opacity 0.2s ease-in-out",
|
| 495 |
-
}}
|
| 496 |
-
/>
|
| 497 |
-
</Link>
|
| 498 |
</Box>
|
| 499 |
</Box>
|
| 500 |
);
|
|
@@ -511,7 +484,7 @@ const createLeaderboardHeader = (label, tooltip, linkUrl) => (header) => (
|
|
| 511 |
}}
|
| 512 |
>
|
| 513 |
<HeaderLabel
|
| 514 |
-
label={
|
| 515 |
tooltip={tooltip}
|
| 516 |
className="header-label"
|
| 517 |
isSorted={header?.column?.getIsSorted()}
|
|
@@ -873,6 +846,8 @@ export const createColumns = (
|
|
| 873 |
accessorKey: "model.openness",
|
| 874 |
header: createHeaderCell("Openness", "Model openness classification"),
|
| 875 |
cell: ({ row }) => {
|
|
|
|
|
|
|
| 876 |
return (
|
| 877 |
<Box
|
| 878 |
sx={{
|
|
@@ -881,8 +856,11 @@ export const createColumns = (
|
|
| 881 |
alignItems: "center",
|
| 882 |
}}
|
| 883 |
>
|
| 884 |
-
<Typography
|
| 885 |
-
|
|
|
|
|
|
|
|
|
|
| 886 |
</Typography>
|
| 887 |
</Box>
|
| 888 |
);
|
|
@@ -1011,162 +989,63 @@ export const createColumns = (
|
|
| 1011 |
);
|
| 1012 |
};
|
| 1013 |
|
| 1014 |
-
const
|
| 1015 |
-
{
|
| 1016 |
-
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
|
| 1020 |
-
|
| 1021 |
-
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
},
|
| 1032 |
-
{
|
| 1033 |
-
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
|
| 1040 |
-
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
-
|
| 1045 |
-
|
| 1046 |
-
|
| 1047 |
-
},
|
| 1048 |
-
},
|
| 1049 |
-
{
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
-
|
| 1054 |
-
|
| 1055 |
-
headerStyle: {
|
| 1056 |
-
backgroundColor: (theme) => alpha(theme.palette.secondary.light, 0.05),
|
| 1057 |
-
},
|
| 1058 |
-
cellStyle: (value) => ({
|
| 1059 |
-
position: "relative",
|
| 1060 |
-
overflow: "hidden",
|
| 1061 |
-
padding: "8px 16px",
|
| 1062 |
-
backgroundColor: (theme) => alpha(theme.palette.secondary.light, 0.05),
|
| 1063 |
-
}),
|
| 1064 |
-
},
|
| 1065 |
-
},
|
| 1066 |
-
{
|
| 1067 |
-
accessorKey: "evaluations.english_average",
|
| 1068 |
-
header: createLeaderboardHeader("English", "Average performance on English language tasks", null),
|
| 1069 |
-
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.english_average"),
|
| 1070 |
-
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
| 1071 |
-
meta: {
|
| 1072 |
-
headerStyle: {
|
| 1073 |
-
backgroundColor: (theme) => alpha(theme.palette.success.light, 0.05),
|
| 1074 |
-
},
|
| 1075 |
-
cellStyle: (value) => ({
|
| 1076 |
-
position: "relative",
|
| 1077 |
-
overflow: "hidden",
|
| 1078 |
-
padding: "8px 16px",
|
| 1079 |
-
backgroundColor: (theme) => alpha(theme.palette.success.light, 0.05),
|
| 1080 |
-
}),
|
| 1081 |
-
},
|
| 1082 |
-
},
|
| 1083 |
-
{
|
| 1084 |
-
accessorKey: "evaluations.chinese_average",
|
| 1085 |
-
header: createLeaderboardHeader("Chinese", "Average performance on Chinese language tasks", null),
|
| 1086 |
-
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.chinese_average"),
|
| 1087 |
-
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
| 1088 |
-
meta: {
|
| 1089 |
-
headerStyle: {
|
| 1090 |
-
backgroundColor: (theme) => alpha(theme.palette.warning.light, 0.05),
|
| 1091 |
-
},
|
| 1092 |
-
cellStyle: (value) => ({
|
| 1093 |
-
position: "relative",
|
| 1094 |
-
overflow: "hidden",
|
| 1095 |
-
padding: "8px 16px",
|
| 1096 |
-
backgroundColor: (theme) => alpha(theme.palette.warning.light, 0.05),
|
| 1097 |
-
}),
|
| 1098 |
-
},
|
| 1099 |
-
},
|
| 1100 |
-
{
|
| 1101 |
-
accessorKey: "evaluations.japanese_average",
|
| 1102 |
-
header: createLeaderboardHeader("Japanese", "Average performance on Japanese language tasks", null),
|
| 1103 |
-
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.japanese_average"),
|
| 1104 |
-
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
| 1105 |
-
meta: {
|
| 1106 |
-
headerStyle: {
|
| 1107 |
-
backgroundColor: (theme) => alpha(theme.palette.error.light, 0.05),
|
| 1108 |
-
},
|
| 1109 |
-
cellStyle: (value) => ({
|
| 1110 |
-
position: "relative",
|
| 1111 |
-
overflow: "hidden",
|
| 1112 |
-
padding: "8px 16px",
|
| 1113 |
-
backgroundColor: (theme) => alpha(theme.palette.error.light, 0.05),
|
| 1114 |
-
}),
|
| 1115 |
-
},
|
| 1116 |
-
},
|
| 1117 |
-
{
|
| 1118 |
-
accessorKey: "evaluations.spanish_average",
|
| 1119 |
-
header: createLeaderboardHeader("Spanish", "Average performance on Spanish language tasks", null),
|
| 1120 |
-
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.spanish_average"),
|
| 1121 |
-
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
| 1122 |
-
meta: {
|
| 1123 |
-
headerStyle: {
|
| 1124 |
-
backgroundColor: (theme) => alpha(theme.palette.info.main, 0.05),
|
| 1125 |
-
},
|
| 1126 |
-
cellStyle: (value) => ({
|
| 1127 |
-
position: "relative",
|
| 1128 |
-
overflow: "hidden",
|
| 1129 |
-
padding: "8px 16px",
|
| 1130 |
-
backgroundColor: (theme) => alpha(theme.palette.info.main, 0.05),
|
| 1131 |
-
}),
|
| 1132 |
-
},
|
| 1133 |
-
},
|
| 1134 |
-
{
|
| 1135 |
-
accessorKey: "evaluations.bilingual_average",
|
| 1136 |
-
header: createLeaderboardHeader("Bilingual", "Average performance on bilingual tasks", null),
|
| 1137 |
-
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.bilingual_average"),
|
| 1138 |
-
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
| 1139 |
-
meta: {
|
| 1140 |
-
headerStyle: {
|
| 1141 |
-
backgroundColor: (theme) => alpha(theme.palette.primary.main, 0.05),
|
| 1142 |
-
},
|
| 1143 |
-
cellStyle: (value) => ({
|
| 1144 |
-
position: "relative",
|
| 1145 |
-
overflow: "hidden",
|
| 1146 |
-
padding: "8px 16px",
|
| 1147 |
-
backgroundColor: (theme) => alpha(theme.palette.primary.main, 0.05),
|
| 1148 |
-
}),
|
| 1149 |
-
},
|
| 1150 |
-
},
|
| 1151 |
-
{
|
| 1152 |
-
accessorKey: "evaluations.multilingual_average",
|
| 1153 |
-
header: createLeaderboardHeader("Multilingual", "Average performance on multilingual tasks", null),
|
| 1154 |
-
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.multilingual_average"),
|
| 1155 |
-
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
| 1156 |
-
meta: {
|
| 1157 |
-
headerStyle: {
|
| 1158 |
-
backgroundColor: (theme) => alpha(theme.palette.secondary.main, 0.05),
|
| 1159 |
-
},
|
| 1160 |
-
cellStyle: (value) => ({
|
| 1161 |
-
position: "relative",
|
| 1162 |
-
overflow: "hidden",
|
| 1163 |
-
padding: "8px 16px",
|
| 1164 |
-
backgroundColor: (theme) => alpha(theme.palette.secondary.main, 0.05),
|
| 1165 |
-
}),
|
| 1166 |
-
},
|
| 1167 |
-
}
|
| 1168 |
];
|
| 1169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1170 |
const optionalColumns = [
|
| 1171 |
{
|
| 1172 |
accessorKey: "model.architecture",
|
|
|
|
| 440 |
);
|
| 441 |
};
|
| 442 |
|
| 443 |
+
// Greek (EL) language text tasks header
|
| 444 |
const createGreekLeaderboardHeader = (header) => (
|
| 445 |
<Box
|
| 446 |
className="header-content"
|
|
|
|
| 452 |
}}
|
| 453 |
>
|
| 454 |
<HeaderLabel
|
| 455 |
+
label="Greek (EL)"
|
| 456 |
+
tooltip="Average performance on Greek language financial tasks (GRFinNUM, GRFinQA, GRFNS-2023, GRMultiFin)"
|
| 457 |
className="header-label"
|
| 458 |
isSorted={header?.column?.getIsSorted()}
|
| 459 |
/>
|
|
|
|
| 467 |
flexShrink: 0,
|
| 468 |
}}
|
| 469 |
>
|
| 470 |
+
<InfoIcon tooltip="Average performance on Greek language financial tasks (GRFinNUM, GRFinQA, GRFNS-2023, GRMultiFin)" />
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
</Box>
|
| 472 |
</Box>
|
| 473 |
);
|
|
|
|
| 484 |
}}
|
| 485 |
>
|
| 486 |
<HeaderLabel
|
| 487 |
+
label={label}
|
| 488 |
tooltip={tooltip}
|
| 489 |
className="header-label"
|
| 490 |
isSorted={header?.column?.getIsSorted()}
|
|
|
|
| 846 |
accessorKey: "model.openness",
|
| 847 |
header: createHeaderCell("Openness", "Model openness classification"),
|
| 848 |
cell: ({ row }) => {
|
| 849 |
+
const openness = row.original.model.openness || "Class III - Open Model";
|
| 850 |
+
const isClosed = openness === "Closed";
|
| 851 |
return (
|
| 852 |
<Box
|
| 853 |
sx={{
|
|
|
|
| 856 |
alignItems: "center",
|
| 857 |
}}
|
| 858 |
>
|
| 859 |
+
<Typography
|
| 860 |
+
variant="body2"
|
| 861 |
+
sx={isClosed ? { color: "text.secondary", fontStyle: "italic" } : {}}
|
| 862 |
+
>
|
| 863 |
+
{openness}
|
| 864 |
</Typography>
|
| 865 |
</Box>
|
| 866 |
);
|
|
|
|
| 989 |
);
|
| 990 |
};
|
| 991 |
|
| 992 |
+
const EVAL_COLUMN_DEFS = [
|
| 993 |
+
{ key: 'sc', label: 'SC', tooltip: 'EN Text: SC (Information Extraction)' },
|
| 994 |
+
{ key: 'finred', label: 'FinRED', tooltip: 'EN Text: FinRED (Information Extraction)' },
|
| 995 |
+
{ key: 'finer_ord', label: 'FINER-ORD', tooltip: 'EN Text: FINER-ORD (Information Extraction)' },
|
| 996 |
+
{ key: 'headlines', label: 'Headlines', tooltip: 'EN Text: Headlines (Text Analysis)' },
|
| 997 |
+
{ key: 'tsa_en', label: 'TSA', tooltip: 'EN Text: TSA (Text Analysis)' },
|
| 998 |
+
{ key: 'xbrl_math', label: 'XBRL-Math', tooltip: 'EN Text: XBRL-Math (Question Answering)' },
|
| 999 |
+
{ key: 'finqa', label: 'FinQA', tooltip: 'EN Text: FinQA (Question Answering)' },
|
| 1000 |
+
{ key: 'tatqa', label: 'TATQA', tooltip: 'EN Text: TATQA (Question Answering)' },
|
| 1001 |
+
{ key: 'ectsum', label: 'ECTSUM', tooltip: 'EN Text: ECTSUM (Text Generation)' },
|
| 1002 |
+
{ key: 'edtsum', label: 'EDTSUM', tooltip: 'EN Text: EDTSUM (Text Generation)' },
|
| 1003 |
+
{ key: 'ccf', label: 'CCF', tooltip: 'EN Text: CCF (Ranking Modeling)' },
|
| 1004 |
+
{ key: 'bigdata22', label: 'BigData22', tooltip: 'EN Text: BigData22 (Portfolio Optimization)' },
|
| 1005 |
+
{ key: 'msft', label: 'MSFT', tooltip: 'EN Text: MSFT (Decision Making)' },
|
| 1006 |
+
{ key: 'rre', label: 'RRE', tooltip: 'ZH Text: RRE (Information Extraction)' },
|
| 1007 |
+
{ key: 'aie', label: 'AIE', tooltip: 'ZH Text: AIE (Text Analysis)' },
|
| 1008 |
+
{ key: 'lne', label: 'LNE', tooltip: 'ZH Text: LNE (Text Analysis)' },
|
| 1009 |
+
{ key: 'financialiq', label: 'FinancialIQ', tooltip: 'ZH Text: FinancialIQ (Question Answering)' },
|
| 1010 |
+
{ key: 'chabsa', label: 'chabsa', tooltip: 'JA Text: chabsa (Text Analysis)' },
|
| 1011 |
+
{ key: 'multifin', label: 'MultiFin', tooltip: 'ES Text: MultiFin (Text Analysis)' },
|
| 1012 |
+
{ key: 'tsa_es', label: 'TSA (ES)', tooltip: 'ES Text: TSA (Text Analysis)' },
|
| 1013 |
+
{ key: 'efpa', label: 'EFPA', tooltip: 'ES Text: EFPA (Question Answering)' },
|
| 1014 |
+
{ key: 'fns_2023', label: 'FNS-2023', tooltip: 'ES Text: FNS-2023 (Text Generation)' },
|
| 1015 |
+
{ key: 'grfinnum', label: 'GRFinNUM', tooltip: 'EL Text: GRFinNUM (Information Extraction)' },
|
| 1016 |
+
{ key: 'grfinqa', label: 'GRFinQA', tooltip: 'EL Text: GRFinQA (Question Answering)' },
|
| 1017 |
+
{ key: 'grfns_2023', label: 'GRFNS-2023', tooltip: 'EL Text: GRFNS-2023 (Text Generation)' },
|
| 1018 |
+
{ key: 'grmultifin', label: 'GRMultiFin', tooltip: 'EL Text: GRMultiFin (Text Analysis)' },
|
| 1019 |
+
{ key: 'dolfin', label: 'DOLFIN', tooltip: 'BI Text: DOLFIN (Text Generation)' },
|
| 1020 |
+
{ key: 'polyfiqa_easy', label: 'PolyFiQA-Easy', tooltip: 'MU Text: PolyFiQA-Easy (Question Answering)' },
|
| 1021 |
+
{ key: 'polyfiqa_expert', label: 'PolyFiQA-Expert', tooltip: 'MU Text: PolyFiQA-Expert (Question Answering)' },
|
| 1022 |
+
{ key: 'text_average', label: 'Text Avg', tooltip: 'Average performance on text tasks' },
|
| 1023 |
+
{ key: 'englishocr', label: 'EnglishOCR', tooltip: 'EN Vision: EnglishOCR (Information Extraction)' },
|
| 1024 |
+
{ key: 'tablebench', label: 'TableBench', tooltip: 'EN Vision: TableBench (Question Answering)' },
|
| 1025 |
+
{ key: 'japaneseocr', label: 'JapaneseOCR', tooltip: 'JA Vision: JapaneseOCR (Information Extraction)' },
|
| 1026 |
+
{ key: 'spanishocr', label: 'SpanishOCR', tooltip: 'ES Vision: SpanishOCR (Information Extraction)' },
|
| 1027 |
+
{ key: 'greekocr', label: 'GreekOCR', tooltip: 'EL Vision: GreekOCR (Information Extraction)' },
|
| 1028 |
+
{ key: 'vision_average', label: 'Vision Avg', tooltip: 'Average performance on vision tasks' },
|
| 1029 |
+
{ key: 'mdrm_test', label: 'MDRM-test', tooltip: 'EN Audio: MDRM-test (Text Generation)' },
|
| 1030 |
+
{ key: 'finaudiosum', label: 'FinAudioSum', tooltip: 'EN Audio: FinAudioSum (Text Generation)' },
|
| 1031 |
+
{ key: 'audio_average', label: 'Audio Avg', tooltip: 'Average performance on audio tasks' },
|
| 1032 |
+
{ key: 'modality_balanced', label: 'Modal. Avg', tooltip: 'Modality-Balanced Average (mean of text, vision, audio averages)' },
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1033 |
];
|
| 1034 |
|
| 1035 |
+
const evaluationColumns = EVAL_COLUMN_DEFS.map(({ key, label, tooltip }) => ({
|
| 1036 |
+
accessorKey: `evaluations.${key}`,
|
| 1037 |
+
header: createLeaderboardHeader(label, tooltip, null),
|
| 1038 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, `evaluations.${key}`),
|
| 1039 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
| 1040 |
+
meta: {
|
| 1041 |
+
cellStyle: () => ({
|
| 1042 |
+
position: "relative",
|
| 1043 |
+
overflow: "hidden",
|
| 1044 |
+
padding: "8px 16px",
|
| 1045 |
+
}),
|
| 1046 |
+
},
|
| 1047 |
+
}));
|
| 1048 |
+
|
| 1049 |
const optionalColumns = [
|
| 1050 |
{
|
| 1051 |
accessorKey: "model.architecture",
|