{
  "benchmark_info": {
    "date": "2026-04-01",
    "framework": "lm-evaluation-harness 0.4.9.2",
    "inference": "llama.cpp (llama-server b8330)",
    "hardware": "Apple M1 Max 32GB",
    "quantization": "Q4_K_M",
    "n_shot": 0,
    "tasks": "KMMLU direct (10 subjects) + HAE-RAE (5 subtasks)",
    "method": "generate_until with regex extraction"
  },
  "models": {
    "vela-dpo-v6": {
      "full_name": "VELA DPO v6 (Qwen2.5-7B + SFT + DPO v6)",
      "file": "vela-dpo-v6-q4km.gguf",
      "size_gb": 4.4
    },
    "qwen2.5-7b-instruct": {
      "full_name": "Qwen2.5-7B-Instruct (baseline)",
      "file": "qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf",
      "size_gb": 4.4
    },
    "exaone-3.5-7.8b": {
      "full_name": "EXAONE-3.5-7.8B-Instruct",
      "file": "EXAONE-3.5-7.8B-Instruct-Q4_K_M.gguf",
      "size_gb": 4.4
    }
  },
  "kmmlu": {
    "accounting": {
      "vela_dpo_v6": 0.38,
      "qwen25_7b": 0.33,
      "exaone_35_7_8b": 0.42
    },
    "computer_science": {
      "vela_dpo_v6": 0.737,
      "qwen25_7b": 0.697,
      "exaone_35_7_8b": 0.697
    },
    "economics": {
      "vela_dpo_v6": 0.454,
      "qwen25_7b": 0.477,
      "exaone_35_7_8b": 0.515
    },
    "korean_history": {
      "vela_dpo_v6": 0.31,
      "qwen25_7b": 0.29,
      "exaone_35_7_8b": 0.22
    },
    "law": {
      "vela_dpo_v6": 0.434,
      "qwen25_7b": 0.461,
      "exaone_35_7_8b": 0.499
    },
    "management": {
      "vela_dpo_v6": 0.54,
      "qwen25_7b": 0.552,
      "exaone_35_7_8b": 0.573
    },
    "marketing": {
      "vela_dpo_v6": 0.757,
      "qwen25_7b": 0.725,
      "exaone_35_7_8b": 0.756
    },
    "math": {
      "vela_dpo_v6": 0.33,
      "qwen25_7b": 0.337,
      "exaone_35_7_8b": 0.277
    },
    "political_science_and_sociology": {
      "vela_dpo_v6": 0.49,
      "qwen25_7b": 0.493,
      "exaone_35_7_8b": 0.56
    },
    "psychology": {
      "vela_dpo_v6": 0.392,
      "qwen25_7b": 0.393,
      "exaone_35_7_8b": 0.457
    }
  },
  "haerae": {
    "general_knowledge": {
      "vela_dpo_v6": 0.4375,
      "qwen25_7b": 0.4205,
      "exaone_35_7_8b": 0.4432
    },
    "history": {
      "vela_dpo_v6": 0.4574,
      "qwen25_7b": 0.4255,
      "exaone_35_7_8b": 0.7766
    },
    "loan_words": {
      "vela_dpo_v6": 0.4852,
      "qwen25_7b": 0.574,
      "exaone_35_7_8b": 0.8107
    },
    "rare_words": {
      "vela_dpo_v6": 0.6988,
      "qwen25_7b": 0.684,
      "exaone_35_7_8b": 0.7877
    },
    "standard_nomenclature": {
      "vela_dpo_v6": 0.6471,
      "qwen25_7b": 0.6601,
      "exaone_35_7_8b": 0.719
    }
  },
  "summary": {
    "kmmlu_avg": {
      "vela_dpo_v6": 0.482,
      "qwen25_7b": 0.476,
      "exaone_35_7_8b": 0.497
    },
    "haerae_avg": {
      "vela_dpo_v6": 0.545,
      "qwen25_7b": 0.553,
      "exaone_35_7_8b": 0.707
    }
  }
}