intrect committed on
Commit
de3e104
·
verified ·
1 Parent(s): 4043357

data: add raw benchmark results JSON (KMMLU + HAE-RAE, 3-model comparison)

Browse files
benchmarks/benchmark_comparison_20260401.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "benchmark_info": {
3
+ "date": "2026-04-01",
4
+ "framework": "lm-evaluation-harness 0.4.9.2",
5
+ "inference": "llama.cpp (llama-server b8330)",
6
+ "hardware": "Apple M1 Max 32GB",
7
+ "quantization": "Q4_K_M",
8
+ "n_shot": 0,
9
+ "tasks": "KMMLU direct (10 subjects) + HAE-RAE (5 subtasks)",
10
+ "method": "generate_until with regex extraction"
11
+ },
12
+ "models": {
13
+ "vela-dpo-v6": {
14
+ "full_name": "VELA DPO v6 (Qwen2.5-7B + SFT + DPO v6)",
15
+ "file": "vela-dpo-v6-q4km.gguf",
16
+ "size_gb": 4.4
17
+ },
18
+ "qwen2.5-7b-instruct": {
19
+ "full_name": "Qwen2.5-7B-Instruct (baseline)",
20
+ "file": "qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf",
21
+ "size_gb": 4.4
22
+ },
23
+ "exaone-3.5-7.8b": {
24
+ "full_name": "EXAONE-3.5-7.8B-Instruct",
25
+ "file": "EXAONE-3.5-7.8B-Instruct-Q4_K_M.gguf",
26
+ "size_gb": 4.4
27
+ }
28
+ },
29
+ "kmmlu": {
30
+ "accounting": {
31
+ "vela_dpo_v6": 0.38,
32
+ "qwen25_7b": 0.33,
33
+ "exaone_35_7_8b": 0.42
34
+ },
35
+ "computer_science": {
36
+ "vela_dpo_v6": 0.737,
37
+ "qwen25_7b": 0.697,
38
+ "exaone_35_7_8b": 0.697
39
+ },
40
+ "economics": {
41
+ "vela_dpo_v6": 0.454,
42
+ "qwen25_7b": 0.477,
43
+ "exaone_35_7_8b": 0.515
44
+ },
45
+ "korean_history": {
46
+ "vela_dpo_v6": 0.31,
47
+ "qwen25_7b": 0.29,
48
+ "exaone_35_7_8b": 0.22
49
+ },
50
+ "law": {
51
+ "vela_dpo_v6": 0.434,
52
+ "qwen25_7b": 0.461,
53
+ "exaone_35_7_8b": 0.499
54
+ },
55
+ "management": {
56
+ "vela_dpo_v6": 0.54,
57
+ "qwen25_7b": 0.552,
58
+ "exaone_35_7_8b": 0.573
59
+ },
60
+ "marketing": {
61
+ "vela_dpo_v6": 0.757,
62
+ "qwen25_7b": 0.725,
63
+ "exaone_35_7_8b": 0.756
64
+ },
65
+ "math": {
66
+ "vela_dpo_v6": 0.33,
67
+ "qwen25_7b": 0.337,
68
+ "exaone_35_7_8b": 0.277
69
+ },
70
+ "political_science_and_sociology": {
71
+ "vela_dpo_v6": 0.49,
72
+ "qwen25_7b": 0.493,
73
+ "exaone_35_7_8b": 0.56
74
+ },
75
+ "psychology": {
76
+ "vela_dpo_v6": 0.392,
77
+ "qwen25_7b": 0.393,
78
+ "exaone_35_7_8b": 0.457
79
+ }
80
+ },
81
+ "haerae": {
82
+ "general_knowledge": {
83
+ "vela_dpo_v6": 0.4375,
84
+ "qwen25_7b": 0.4205,
85
+ "exaone_35_7_8b": 0.4432
86
+ },
87
+ "history": {
88
+ "vela_dpo_v6": 0.4574,
89
+ "qwen25_7b": 0.4255,
90
+ "exaone_35_7_8b": 0.7766
91
+ },
92
+ "loan_words": {
93
+ "vela_dpo_v6": 0.4852,
94
+ "qwen25_7b": 0.574,
95
+ "exaone_35_7_8b": 0.8107
96
+ },
97
+ "rare_words": {
98
+ "vela_dpo_v6": 0.6988,
99
+ "qwen25_7b": 0.684,
100
+ "exaone_35_7_8b": 0.7877
101
+ },
102
+ "standard_nomenclature": {
103
+ "vela_dpo_v6": 0.6471,
104
+ "qwen25_7b": 0.6601,
105
+ "exaone_35_7_8b": 0.719
106
+ }
107
+ },
108
+ "summary": {
109
+ "kmmlu_avg": {
110
+ "vela_dpo_v6": 0.482,
111
+ "qwen25_7b": 0.476,
112
+ "exaone_35_7_8b": 0.497
113
+ },
114
+ "haerae_avg": {
115
+ "vela_dpo_v6": 0.545,
116
+ "qwen25_7b": 0.553,
117
+ "exaone_35_7_8b": 0.707
118
+ }
119
+ }
120
+ }