File size: 2,828 Bytes
de3e104 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | {
"benchmark_info": {
"date": "2026-04-01",
"framework": "lm-evaluation-harness 0.4.9.2",
"inference": "llama.cpp (llama-server b8330)",
"hardware": "Apple M1 Max 32GB",
"quantization": "Q4_K_M",
"n_shot": 0,
"tasks": "KMMLU direct (10 subjects) + HAE-RAE (5 subtasks)",
"method": "generate_until with regex extraction"
},
"models": {
"vela-dpo-v6": {
"full_name": "VELA DPO v6 (Qwen2.5-7B + SFT + DPO v6)",
"file": "vela-dpo-v6-q4km.gguf",
"size_gb": 4.4
},
"qwen2.5-7b-instruct": {
"full_name": "Qwen2.5-7B-Instruct (baseline)",
"file": "qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf",
"size_gb": 4.4
},
"exaone-3.5-7.8b": {
"full_name": "EXAONE-3.5-7.8B-Instruct",
"file": "EXAONE-3.5-7.8B-Instruct-Q4_K_M.gguf",
"size_gb": 4.4
}
},
"kmmlu": {
"accounting": {
"vela_dpo_v6": 0.38,
"qwen25_7b": 0.33,
"exaone_35_7_8b": 0.42
},
"computer_science": {
"vela_dpo_v6": 0.737,
"qwen25_7b": 0.697,
"exaone_35_7_8b": 0.697
},
"economics": {
"vela_dpo_v6": 0.454,
"qwen25_7b": 0.477,
"exaone_35_7_8b": 0.515
},
"korean_history": {
"vela_dpo_v6": 0.31,
"qwen25_7b": 0.29,
"exaone_35_7_8b": 0.22
},
"law": {
"vela_dpo_v6": 0.434,
"qwen25_7b": 0.461,
"exaone_35_7_8b": 0.499
},
"management": {
"vela_dpo_v6": 0.54,
"qwen25_7b": 0.552,
"exaone_35_7_8b": 0.573
},
"marketing": {
"vela_dpo_v6": 0.757,
"qwen25_7b": 0.725,
"exaone_35_7_8b": 0.756
},
"math": {
"vela_dpo_v6": 0.33,
"qwen25_7b": 0.337,
"exaone_35_7_8b": 0.277
},
"political_science_and_sociology": {
"vela_dpo_v6": 0.49,
"qwen25_7b": 0.493,
"exaone_35_7_8b": 0.56
},
"psychology": {
"vela_dpo_v6": 0.392,
"qwen25_7b": 0.393,
"exaone_35_7_8b": 0.457
}
},
"haerae": {
"general_knowledge": {
"vela_dpo_v6": 0.4375,
"qwen25_7b": 0.4205,
"exaone_35_7_8b": 0.4432
},
"history": {
"vela_dpo_v6": 0.4574,
"qwen25_7b": 0.4255,
"exaone_35_7_8b": 0.7766
},
"loan_words": {
"vela_dpo_v6": 0.4852,
"qwen25_7b": 0.574,
"exaone_35_7_8b": 0.8107
},
"rare_words": {
"vela_dpo_v6": 0.6988,
"qwen25_7b": 0.684,
"exaone_35_7_8b": 0.7877
},
"standard_nomenclature": {
"vela_dpo_v6": 0.6471,
"qwen25_7b": 0.6601,
"exaone_35_7_8b": 0.719
}
},
"summary": {
"kmmlu_avg": {
"vela_dpo_v6": 0.482,
"qwen25_7b": 0.476,
"exaone_35_7_8b": 0.497
},
"haerae_avg": {
"vela_dpo_v6": 0.545,
"qwen25_7b": 0.553,
"exaone_35_7_8b": 0.707
}
}
} |