# opencompass/.github/scripts/oc_score_baseline.yaml
# Uploaded by msj19 ("Add files using upload-large-folder tool", commit 9b40ad5, verified).
---
# Baseline scores for qwen2.5-7b-hf (values are accuracy percentages).
# Indentation restored: nested metric keys must sit under the model key,
# otherwise every stanza collapses into one flat mapping with duplicate keys.
qwen2.5-7b-hf:
  demo_gsm8k_accuracy: 78.12
  race-middle_accuracy: 90.46
  race-high_accuracy: 86.54
# Baseline scores for internlm3-8b-instruct-lmdeploy (accuracy percentages).
# Indentation restored so metric keys nest under the model key.
internlm3-8b-instruct-lmdeploy:
  demo_gsm8k_accuracy: 75.00
  race-middle_accuracy: 93.31
  race-high_accuracy: 90.28
# Baseline scores for internlm3-8b-instruct_hf-lmdeploy (accuracy percentages).
# Indentation restored so metric keys nest under the model key.
internlm3-8b-instruct_hf-lmdeploy:
  demo_gsm8k_accuracy: 73.44
  race-middle_accuracy: 93.38
  race-high_accuracy: 90.34
# Baseline scores for Qwen2.5-7B_hf (accuracy percentages).
# NOTE(review): values are identical to the qwen2.5-7b-hf stanza — confirm
# whether both spellings are intentionally kept as separate lookup keys.
Qwen2.5-7B_hf:
  demo_gsm8k_accuracy: 78.12
  race-middle_accuracy: 90.46
  race-high_accuracy: 86.54
# Baseline scores for Qwen3-0.6B_hf-vllm (accuracy percentages).
# Indentation restored so metric keys nest under the model key.
Qwen3-0.6B_hf-vllm:
  demo_gsm8k_accuracy: 53.12
  race-middle_accuracy: 38.23
  race-high_accuracy: 28.07
# Baseline scores for the lmdeploy-api-test configuration (accuracy percentages).
# Indentation restored so metric keys nest under the test key.
lmdeploy-api-test:
  IFEval_Prompt-level-strict-accuracy: 81.25
  hle_llmjudge_accuracy: 75.00
  mmlu_pro_math_accuracy: 18.75
  mmlu_pro_other_accuracy: 25.00
  race-middle_accuracy: 87.50
  race-high_accuracy: 81.25
  gsm8k_accuracy: 18.75
# Baseline scores for the lmdeploy-api-streaming-test configuration
# (accuracy percentages). Indentation restored so metric keys nest
# under the test key.
lmdeploy-api-streaming-test:
  IFEval_Prompt-level-strict-accuracy: 68.75
  hle_llmjudge_accuracy: 81.25
  mmlu_pro_math_accuracy: 18.75
  mmlu_pro_other_accuracy: 31.25
  race-middle_accuracy: 81.25
  race-high_accuracy: 75.00
  gsm8k_accuracy: 25.00
# Baseline scores for the lmdeploy-api-streaming-test-chunk configuration
# (accuracy percentages). Indentation restored, and bare numerics (75, 37.5)
# normalized to the two-decimal convention used by every other stanza so the
# file is uniformly typed/formatted (values are numerically unchanged).
lmdeploy-api-streaming-test-chunk:
  IFEval_Prompt-level-strict-accuracy: 81.25
  hle_llmjudge_accuracy: 75.00
  mmlu_pro_math_accuracy: 18.75
  mmlu_pro_other_accuracy: 37.50
  race-middle_accuracy: 75.00
  race-high_accuracy: 81.25
  gsm8k_accuracy: 37.50
# Baseline scores for the lmdeploy-api-test-maxlen configuration
# (accuracy percentages). Indentation restored so metric keys nest
# under the test key.
lmdeploy-api-test-maxlen:
  IFEval_Prompt-level-strict-accuracy: 93.75
  hle_llmjudge_accuracy: 56.25
  mmlu_pro_math_accuracy: 62.50
  mmlu_pro_other_accuracy: 43.75
  race-middle_accuracy: 81.25
  race-high_accuracy: 81.25
  gsm8k_accuracy: 31.25
# Baseline scores for the lmdeploy-api-test-maxlen-mid configuration
# (accuracy percentages). Indentation restored, and bare-integer zeros
# normalized to 0.00 to match the two-decimal float convention used by
# every other value (numerically unchanged; keeps consumers from seeing
# a mix of int and float types).
lmdeploy-api-test-maxlen-mid:
  IFEval_Prompt-level-strict-accuracy: 12.50
  hle_llmjudge_accuracy: 81.25
  mmlu_pro_math_accuracy: 0.00
  mmlu_pro_other_accuracy: 0.00
  race-middle_accuracy: 18.75
  race-high_accuracy: 12.50
  gsm8k_accuracy: 43.75
# Baseline scores for the lmdeploy-api-test-nothink configuration
# (accuracy percentages). Indentation restored so metric keys nest
# under the test key.
lmdeploy-api-test-nothink:
  IFEval_Prompt-level-strict-accuracy: 93.75
  hle_llmjudge_accuracy: 62.50
  mmlu_pro_math_accuracy: 62.50
  mmlu_pro_other_accuracy: 50.00
  race-middle_accuracy: 87.50
  race-high_accuracy: 87.50
  gsm8k_accuracy: 31.25
# Baseline scores for the lmdeploy-api-test-chat-template configuration
# (accuracy percentages). Indentation restored so metric keys nest under
# the test key. NOTE(review): this stanza has no gsm8k_accuracy entry,
# unlike the other lmdeploy-api-* stanzas — confirm whether that metric
# is intentionally absent here.
lmdeploy-api-test-chat-template:
  IFEval_Prompt-level-strict-accuracy: 68.75
  hle_llmjudge_accuracy: 62.50
  mmlu_pro_math_accuracy: 18.75
  mmlu_pro_other_accuracy: 6.25
  race-middle_accuracy: 81.25
  race-high_accuracy: 68.75