---
base_model:
- Intelligent-Internet/II-Medical-8B
---

vllm (pretrained=/root/autodl-tmp/II-Medical-8B,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.800 | ± | 0.0253 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.876 | ± | 0.0209 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.796 | ± | 0.018 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.872 | ± | 0.015 |


| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|-------:|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.7216 | ± | 0.0144 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.7077 | ± | 0.0296 |
| - other           |       2 | none   |        | acc    | ↑ | 0.7179 | ± | 0.0312 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8278 | ± | 0.0278 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.6667 | ± | 0.0263 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-70-256-4096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.720 | ± | 0.0285 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.816 | ± | 0.0246 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-64-4096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.756 | ± | 0.0272 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.848 | ± | 0.0228 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.764 | ± | 0.0269 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.852 | ± | 0.0225 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-4096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.764 | ± | 0.0269 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.864 | ± | 0.0217 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-4096-2,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.772 | ± | 0.0266 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.856 | ± | 0.0222 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-4096-2.7,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.756 | ± | 0.0272 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.868 | ± | 0.0215 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-3096-2.9,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.772 | ± | 0.0266 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.868 | ± | 0.0215 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-3096-3,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.744 | ± | 0.0277 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.836 | ± | 0.0235 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-4096-3,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.796 | ± | 0.0255 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.876 | ± | 0.0209 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-4096-3,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.794 | ± | 0.0181 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.866 | ± | 0.0152 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-4096-3,add_bos_token=true,max_model_len=3048,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|-------:|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.7170 | ± | 0.0145 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.7128 | ± | 0.0293 |
| - other           |       2 | none   |        | acc    | ↑ | 0.6923 | ± | 0.0323 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8111 | ± | 0.0284 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.6772 | ± | 0.0262 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-3096-3.1,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.796 | ± | 0.0255 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.884 | ± | 0.0203 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-3096-3.1,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.750 | ± | 0.0194 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.858 | ± | 0.0156 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-3096-3.1,add_bos_token=true,max_model_len=3048,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|-------:|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.7205 | ± | 0.0144 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.7077 | ± | 0.0291 |
| - other           |       2 | none   |        | acc    | ↑ | 0.7128 | ± | 0.0315 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8167 | ± | 0.0286 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.6737 | ± | 0.0262 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-128-4096-3.3,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.696 | ± | 0.0292 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.852 | ± | 0.0225 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-256-4096-3,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.776 | ± | 0.0264 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.864 | ± | 0.0217 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-80-256-4096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.712 | ± | 0.0287 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.844 | ± | 0.0230 |

vllm (pretrained=/root/autodl-tmp/II-Medical-8B-82-128-3096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.708 | ± | 0.0288 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.880 | ± | 0.0206 |
