{
  "benchmark_info": {
    "date": "2026-04-01",
    "framework": "lm-evaluation-harness 0.4.9.2",
    "inference": "llama.cpp (llama-server b8330)",
    "hardware": "Apple M1 Max 32GB",
    "quantization": "Q4_K_M",
    "n_shot": 0,
    "tasks": "KMMLU direct (10 subjects) + HAE-RAE (5 subtasks)",
    "method": "generate_until with regex extraction"
  },
  "models": {
    "vela-dpo-v6": {
      "full_name": "VELA DPO v6 (Qwen2.5-7B + SFT + DPO v6)",
      "file": "vela-dpo-v6-q4km.gguf",
      "size_gb": 4.4
    },
    "qwen2.5-7b-instruct": {
      "full_name": "Qwen2.5-7B-Instruct (baseline)",
      "file": "qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf",
      "size_gb": 4.4
    },
    "exaone-3.5-7.8b": {
      "full_name": "EXAONE-3.5-7.8B-Instruct",
      "file": "EXAONE-3.5-7.8B-Instruct-Q4_K_M.gguf",
      "size_gb": 4.4
    }
  },
  "kmmlu": {
    "accounting": {
      "vela_dpo_v6": 0.38,
      "qwen25_7b": 0.33,
      "exaone_35_7_8b": 0.42
    },
    "computer_science": {
      "vela_dpo_v6": 0.737,
      "qwen25_7b": 0.697,
      "exaone_35_7_8b": 0.697
    },
    "economics": {
      "vela_dpo_v6": 0.454,
      "qwen25_7b": 0.477,
      "exaone_35_7_8b": 0.515
    },
    "korean_history": {
      "vela_dpo_v6": 0.31,
      "qwen25_7b": 0.29,
      "exaone_35_7_8b": 0.22
    },
    "law": {
      "vela_dpo_v6": 0.434,
      "qwen25_7b": 0.461,
      "exaone_35_7_8b": 0.499
    },
    "management": {
      "vela_dpo_v6": 0.54,
      "qwen25_7b": 0.552,
      "exaone_35_7_8b": 0.573
    },
    "marketing": {
      "vela_dpo_v6": 0.757,
      "qwen25_7b": 0.725,
      "exaone_35_7_8b": 0.756
    },
    "math": {
      "vela_dpo_v6": 0.33,
      "qwen25_7b": 0.337,
      "exaone_35_7_8b": 0.277
    },
    "political_science_and_sociology": {
      "vela_dpo_v6": 0.49,
      "qwen25_7b": 0.493,
      "exaone_35_7_8b": 0.56
    },
    "psychology": {
      "vela_dpo_v6": 0.392,
      "qwen25_7b": 0.393,
      "exaone_35_7_8b": 0.457
    }
  },
  "haerae": {
    "general_knowledge": {
      "vela_dpo_v6": 0.4375,
      "qwen25_7b": 0.4205,
      "exaone_35_7_8b": 0.4432
    },
    "history": {
      "vela_dpo_v6": 0.4574,
      "qwen25_7b": 0.4255,
      "exaone_35_7_8b": 0.7766
    },
    "loan_words": {
      "vela_dpo_v6": 0.4852,
      "qwen25_7b": 0.574,
      "exaone_35_7_8b": 0.8107
    },
    "rare_words": {
      "vela_dpo_v6": 0.6988,
      "qwen25_7b": 0.684,
      "exaone_35_7_8b": 0.7877
    },
    "standard_nomenclature": {
      "vela_dpo_v6": 0.6471,
      "qwen25_7b": 0.6601,
      "exaone_35_7_8b": 0.719
    }
  },
  "summary": {
    "kmmlu_avg": {
      "vela_dpo_v6": 0.482,
      "qwen25_7b": 0.476,
      "exaone_35_7_8b": 0.497
    },
    "haerae_avg": {
      "vela_dpo_v6": 0.545,
      "qwen25_7b": 0.553,
      "exaone_35_7_8b": 0.707
    }
  }
}